// SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now).
To avoid the need of * wrappers on caller side, we provide two sets of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); }
/** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear bits with indices @start <= i < @end in the bitmap. */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } }
/** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return false; return map[end_word] & ethnl_lower_bits(end); }
/** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? *mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); }
/** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of the netlink attribute composed by a later call to * ethnl_put_bitset32() with the same arguments.
* * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? 
names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? 
names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) ethnl_bitmap32_clear(bitmap, 0, nbits, mod); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); *mod = true; } } return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? 
tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. * * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. 
* * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. 
*/ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. */ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */ |
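/* Illustrative sketch (not part of bitset.c): the compact bitset update rule
 * applied by ethnl_bitmap32_update() is x = (x & ~mask) | (val & mask) on each
 * u32 word. The standalone userspace program below demonstrates that rule on
 * plain u32 arrays; all names in it are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

static void demo_bitmap32_update(uint32_t *dst, unsigned int nwords,
				 const uint32_t *value, const uint32_t *mask)
{
	unsigned int i;

	/* bits covered by mask take the value bits, the rest are preserved */
	for (i = 0; i < nwords; i++)
		dst[i] = (dst[i] & ~mask[i]) | (value[i] & mask[i]);
}

int main(void)
{
	uint32_t bitmap[2] = { 0x000000ff, 0x00000000 };
	uint32_t value[2] = { 0x00000f00, 0x00000001 };
	uint32_t mask[2] = { 0x00000ff0, 0x00000001 };

	demo_bitmap32_update(bitmap, 2, value, mask);
	printf("%08x %08x\n", bitmap[0], bitmap[1]); /* 00000f0f 00000001 */
	return 0;
}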
// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include "mballoc.h" #include <linux/sort.h> #include <linux/list_sort.h> #include <trace/events/ext4.h> /* Convert an ext4_fsmap to an fsmap.
*/ void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, struct ext4_fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_offset = 0; dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; dest->fmr_reserved[0] = 0; dest->fmr_reserved[1] = 0; dest->fmr_reserved[2] = 0; } /* Convert an fsmap to an ext4_fsmap. */ void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, struct fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; } /* getfsmap query state */ struct ext4_getfsmap_info { struct ext4_fsmap_head *gfi_head; ext4_fsmap_format_t gfi_formatter; /* formatting fn */ void *gfi_format_arg;/* format buffer */ ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ u32 gfi_dev; /* device id */ ext4_group_t gfi_agno; /* bg number, if applicable */ struct ext4_fsmap gfi_low; /* low rmap key */ struct ext4_fsmap gfi_high; /* high rmap key */ struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ struct list_head gfi_meta_list; /* fixed metadata list */ bool gfi_last; /* last extent? */ }; /* Associate a device with a getfsmap handler. */ struct ext4_getfsmap_dev { int (*gfd_fn)(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info); u32 gfd_dev; }; /* Compare two getfsmap device handlers. */ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) { const struct ext4_getfsmap_dev *d1 = p1; const struct ext4_getfsmap_dev *d2 = p2; return d1->gfd_dev - d2->gfd_dev; } /* Compare a record against our starting point */ static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { return rec->fmr_physical < info->gfi_low.fmr_physical; } /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. */ static int ext4_getfsmap_helper(struct super_block *sb, struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { struct ext4_fsmap fmr; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t rec_fsblk = rec->fmr_physical; ext4_group_t agno; ext4_grpblk_t cno; int error; if (fatal_signal_pending(current)) return -EINTR; /* * Filter out records that start before our startpoint, if the * caller requested that. */ if (ext4_getfsmap_rec_before_low_key(info, rec)) { rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* Are we just counting mappings? */ if (info->gfi_head->fmh_count == 0) { if (info->gfi_head->fmh_entries == UINT_MAX) return EXT4_QUERY_RANGE_ABORT; if (rec_fsblk > info->gfi_next_fsblk) info->gfi_head->fmh_entries++; if (info->gfi_last) return EXT4_QUERY_RANGE_CONTINUE; info->gfi_head->fmh_entries++; rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* * If the record starts past the last physical block we saw, * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. 
*/ if (rec_fsblk > info->gfi_next_fsblk) { if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec_fsblk - info->gfi_next_fsblk, EXT4_FMR_OWN_UNKNOWN); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = info->gfi_next_fsblk; fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; } if (info->gfi_last) goto out; /* Fill out the extent we found */ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec->fmr_length, rec->fmr_owner); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = rec_fsblk; fmr.fmr_owner = rec->fmr_owner; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; fmr.fmr_length = rec->fmr_length; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; out: rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) { return fmr->fmr_physical + fmr->fmr_length; } /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv) { struct ext4_fsmap irec; struct ext4_getfsmap_info *info = priv; struct ext4_fsmap *p; struct ext4_fsmap *tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb; ext4_fsblk_t fslen; int error; fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); fslen = EXT4_C2B(sbi, len); /* If the retained free extent record is set... */ if (info->gfi_lastfree.fmr_owner) { /* ...and abuts this one, lengthen it and return. */ if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { info->gfi_lastfree.fmr_length += fslen; return 0; } /* * There's a gap between the two free extents; emit the * retained extent prior to merging the meta_list. */ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) return error; info->gfi_lastfree.fmr_owner = 0; } /* Merge in any relevant extents from the meta_list */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { list_del(&p->fmr_list); kfree(p); } else if (p->fmr_physical < fsb) { error = ext4_getfsmap_helper(sb, info, p); if (error) return error; list_del(&p->fmr_list); kfree(p); } } irec.fmr_device = 0; irec.fmr_physical = fsb; irec.fmr_length = fslen; irec.fmr_owner = EXT4_FMR_OWN_FREE; irec.fmr_flags = 0; /* If this is a free extent at the end of a bg, buffer it. */ if (ext4_fsmap_next_pblk(&irec) == ext4_group_first_block_no(sb, agno + 1)) { info->gfi_lastfree = irec; return 0; } /* Otherwise, emit it */ return ext4_getfsmap_helper(sb, info, &irec); } /* Execute a getfsmap query against the log device. 
*/ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { journal_t *journal = EXT4_SB(sb)->s_journal; struct ext4_fsmap irec; /* Set up search keys */ info->gfi_low = keys[0]; info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); if (keys[0].fmr_physical > 0) return 0; /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; return ext4_getfsmap_helper(sb, info, &irec); } /* Helper to fill out an ext4_fsmap. */ static inline int ext4_getfsmap_fill(struct list_head *meta_list, ext4_fsblk_t fsb, ext4_fsblk_t len, uint64_t owner) { struct ext4_fsmap *fsm; fsm = kmalloc(sizeof(*fsm), GFP_NOFS); if (!fsm) return -ENOMEM; fsm->fmr_device = 0; fsm->fmr_flags = 0; fsm->fmr_physical = fsb; fsm->fmr_owner = owner; fsm->fmr_length = len; list_add_tail(&fsm->fmr_list, meta_list); return 0; } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, ext4_group_t agno, struct list_head *meta_list) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); ext4_fsblk_t len; unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); int error; /* Record the superblock. */ if (ext4_bg_has_super(sb, agno)) { error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); if (error) return error; fsb++; } /* Record the group descriptors. */ len = ext4_bg_num_gdb(sb, agno); if (!len) return 0; error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_GDT); if (error) return error; fsb += len; /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) return error; } return 0; } /* Compare two fsmap items. */ static int ext4_getfsmap_compare(void *priv, const struct list_head *a, const struct list_head *b) { struct ext4_fsmap *fa; struct ext4_fsmap *fb; fa = container_of(a, struct ext4_fsmap, fmr_list); fb = container_of(b, struct ext4_fsmap, fmr_list); if (fa->fmr_physical < fb->fmr_physical) return -1; else if (fa->fmr_physical > fb->fmr_physical) return 1; return 0; } /* Merge adjacent extents of fixed metadata. */ static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *prev = NULL; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { if (!prev) { prev = p; continue; } if (prev->fmr_owner == p->fmr_owner && prev->fmr_physical + prev->fmr_length == p->fmr_physical) { prev->fmr_length += p->fmr_length; list_del(&p->fmr_list); kfree(p); } else prev = p; } } /* Free a list of fixed metadata. 
*/ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { list_del(&p->fmr_list); kfree(p); } } /* Find all the fixed metadata in the filesystem. */ static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; int error; INIT_LIST_HEAD(meta_list); /* Collect everything. */ for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { gdp = ext4_get_group_desc(sb, agno, NULL); if (!gdp) { error = -EFSCORRUPTED; goto err; } /* Superblock & GDT */ error = ext4_getfsmap_find_sb(sb, agno, meta_list); if (error) goto err; /* Block bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_block_bitmap(sb, gdp), 1, EXT4_FMR_OWN_BLKBM); if (error) goto err; /* Inode bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_inode_bitmap(sb, gdp), 1, EXT4_FMR_OWN_INOBM); if (error) goto err; /* Inodes */ error = ext4_getfsmap_fill(meta_list, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group, EXT4_FMR_OWN_INODES); if (error) goto err; } /* Sort the list */ list_sort(NULL, meta_list, ext4_getfsmap_compare); /* Merge adjacent extents */ ext4_getfsmap_merge_fixed_metadata(meta_list); return 0; err: ext4_getfsmap_free_fixed_metadata(meta_list); return error; } /* Execute a getfsmap query against the buddy bitmaps */ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; else if (keys[0].fmr_physical < bofs) keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; if (keys[1].fmr_physical < keys[0].fmr_physical) return 0; start_fsb = keys[0].fmr_physical; end_fsb = keys[1].fmr_physical; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); /* * Convert the fsmap low/high keys to bg based keys. Initialize * low to the fsmap low key and max out the high key to the end * of the bg. */ info->gfi_low = keys[0]; info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); /* Assemble a list of all the fixed-location metadata. */ error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); if (error) goto err; /* Query each bg */ for (info->gfi_agno = start_ag; info->gfi_agno <= end_ag; info->gfi_agno++) { /* * Set the bg high key from the fsmap high key if this * is the last bg that we're querying. 
*/ if (info->gfi_agno == end_ag) { info->gfi_high = keys[1]; info->gfi_high.fmr_physical = EXT4_C2B(sbi, last_cluster); info->gfi_high.fmr_length = 0; } trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), ext4_getfsmap_datadev_helper, info); if (error) goto err; /* * Set the bg low key to the start of the bg prior to * moving on to the next bg. */ if (info->gfi_agno == start_ag) memset(&info->gfi_low, 0, sizeof(info->gfi_low)); } /* Do we have a retained free extent? */ if (info->gfi_lastfree.fmr_owner) { error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) goto err; } /* Report any gaps at the end of the bg */ info->gfi_last = true; error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); if (error) goto err; err: ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); return error; } /* Do we recognize the device? */ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, struct ext4_fsmap *fm) { if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; if (EXT4_SB(sb)->s_journal_bdev && fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) return true; return false; } /* Ensure that the low key is less than the high key. */ static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, struct ext4_fsmap *high_key) { if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) return true; if (low_key->fmr_physical > high_key->fmr_physical) return false; if (low_key->fmr_physical < high_key->fmr_physical) return true; if (low_key->fmr_owner > high_key->fmr_owner) return false; if (low_key->fmr_owner < high_key->fmr_owner) return true; return false; } #define EXT4_GETFSMAP_DEVS 2 /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all * extents are mapped, until the passed-in head->fmh_count slots have * been filled, or until the formatter short-circuits the loop, if it * is tracking filled-in extents on its own. * * Key to Confusion * ---------------- * There are multiple levels of keys and counters at work here: * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; * these reflect fs-wide block addrs. * dkeys -- fmh_keys used to query each device; * these are fmh_keys but w/ the low key * bumped up by fmr_length. * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this * is how we detect gaps in the fsmap * records and report them. * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from * dkeys; used to query the free space. 
*/ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, ext4_fsmap_format_t formatter, void *arg) { struct ext4_fsmap dkeys[2]; /* per-dev keys */ struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; struct ext4_getfsmap_info info = { NULL }; int i; int error = 0; if (head->fmh_iflags & ~FMH_IF_VALID) return -EINVAL; if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) return -EINVAL; head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; if (EXT4_SB(sb)->s_journal_bdev) { handlers[1].gfd_dev = new_encode_dev( EXT4_SB(sb)->s_journal_bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), ext4_getfsmap_dev_compare, NULL); /* * To continue where we left off, we allow userspace to use the * last mapping from a previous call as the low key of the next. * This is identified by a non-zero length in the low key. We * have to increment the low key in this scenario to ensure we * don't return the same mapping again, and instead return the * very next mapping. * * Bump the physical offset as there can be no other mapping for * the same physical block range. */ dkeys[0] = head->fmh_keys[0]; dkeys[0].fmr_physical += dkeys[0].fmr_length; dkeys[0].fmr_owner = 0; dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) return -EINVAL; info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.gfi_formatter = formatter; info.gfi_format_arg = arg; info.gfi_head = head; /* For each device we support... */ for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ if (!handlers[i].gfd_fn) continue; if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) continue; if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) break; /* * If this device number matches the high key, we have * to pass the high key to the handler to limit the * query results. If the device number exceeds the * low key, zero out the low key so that we get * everything from the beginning. */ if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) dkeys[1] = head->fmh_keys[1]; if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); info.gfi_dev = handlers[i].gfd_dev; info.gfi_last = false; info.gfi_agno = -1; error = handlers[i].gfd_fn(sb, dkeys, &info); if (error) break; info.gfi_next_fsblk = 0; } head->fmh_oflags = FMH_OF_DEV_T; return error; } |
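/* Minimal userspace sketch of how a caller might drive ext4_getfsmap() through
 * the GETFSMAP ioctl, assuming the UAPI in <linux/fsmap.h> (struct fsmap_head,
 * FSMAP_HEAD_SIZEOF(), FS_IOC_GETFSMAP); the mount point is made up and error
 * handling is trimmed. Copying the last returned record into fmh_keys[0] is
 * what the "non-zero length in the low key" continuation logic above relies on.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fsmap.h>

int main(void)
{
	unsigned int i, nrec = 128;
	struct fsmap_head *head;
	int fd = open("/mnt/ext4", O_RDONLY);

	head = calloc(1, FSMAP_HEAD_SIZEOF(nrec));
	head->fmh_count = nrec;
	/* low key stays all zeroes, high key all ones: map the whole fs */
	memset(&head->fmh_keys[1], 0xFF, sizeof(struct fsmap));

	if (ioctl(fd, FS_IOC_GETFSMAP, head) == 0) {
		for (i = 0; i < head->fmh_entries; i++)
			printf("dev %u phys %llu len %llu owner 0x%llx\n",
			       head->fmh_recs[i].fmr_device,
			       (unsigned long long)head->fmh_recs[i].fmr_physical,
			       (unsigned long long)head->fmh_recs[i].fmr_length,
			       (unsigned long long)head->fmh_recs[i].fmr_owner);
		/* a follow-up query would continue after the last mapping */
		if (head->fmh_entries)
			head->fmh_keys[0] = head->fmh_recs[head->fmh_entries - 1];
	}
	free(head);
	return 0;
}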
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE.
*/ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now set the static buffer to * something about 8 kbyte to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); 
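
/* Usage sketch (userspace, for illustration only - not built with this
 * module): how an application typically talks to the socket API whose
 * behaviour is summarized in the header comment above. The interface name
 * "can0", the 0x700/0x708 CAN IDs and the payload size are arbitrary
 * example values; the constants and structures come from linux/can.h and
 * linux/can/isotp.h.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <net/if.h>
 *	#include <sys/socket.h>
 *	#include <linux/can.h>
 *	#include <linux/can/isotp.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_can addr = { .can_family = AF_CAN };
 *		struct can_isotp_options opts = {};
 *		unsigned char pdu[100] = { 0 };	// > 7 bytes forces FF/CF segmentation
 *		int s;
 *
 *		s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);
 *		if (s < 0)
 *			return 1;
 *
 *		// block in write() until the whole PDU has been sent
 *		// (options must be set before bind())
 *		opts.flags |= CAN_ISOTP_WAIT_TX_DONE;
 *		setsockopt(s, SOL_CAN_ISOTP, CAN_ISOTP_OPTS, &opts, sizeof(opts));
 *
 *		addr.can_ifindex = if_nametoindex("can0");
 *		addr.can_addr.tp.tx_id = 0x700;	// CAN ID we send PDUs on
 *		addr.can_addr.tp.rx_id = 0x708;	// CAN ID we receive FC frames on
 *		if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
 *			return 1;
 *
 *		// TX path problems from the list above are reported via errno
 *		if (write(s, pdu, sizeof(pdu)) < 0)
 *			perror("write");
 *
 *		close(s);
 *		return 0;
 *	}
 */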
static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. 
*/ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. * The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get communication parameters only from the first FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - 
pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - 
report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. 
*/ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) 
*/ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); /* cfecho should be consumed by isotp_rcv_echo() here */ if (so->tx.state == ISOTP_SENDING && !so->cfecho) isotp_send_cframe(so); return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; wait_free_buffer: /* we do not support multiple buffers - for now */ if (wq_has_sleeper(&so->wait) && (msg->msg_flags & MSG_DONTWAIT)) return -EAGAIN; /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; goto wait_free_buffer; } /* PDU size > default => try max_pdu_size */ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL); if (newbuf) { so->tx.buf = newbuf; so->tx.buflen = max_pdu_size; } } if (!size || size > so->tx.buflen) { err = -EINVAL; goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? 
*/ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); goto err_out_drop; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; so->tx.len = size; so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. * * SF_DL ESC offset optimization: * * When TX_DL is greater 8 but the message would still fit * into a 8 byte CAN frame, we can omit the offset. * This prevents a protocol caused length extension from * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. */ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) off = 0; isotp_fill_dataframe(cf, so, ae, off); /* place single frame N_PCI w/o length in appropriate index */ cf->data[ae] = N_PCI_SF; /* place SF_DL size value depending on the SF_DL ESC offset */ if (off) cf->data[SF_PCI_SZ4 + ae] = size; else cf->data[ae] |= size; /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ isotp_create_fframe(cf, so, ae); if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { /* set timer for FC-less operation (STmin = 0) */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_set(0, so->force_tx_stmin); else so->tx_gap = ktime_set(0, so->frame_txtime); /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ hrtimer_sec = ISOTP_FC_TIMEOUT; /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ so->cfecho = 0; } } hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); dev_put(dev); if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; goto err_out_drop; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; err = sock_error(sk); if (err) return err; } return size; err_event_drop: /* got signal: force tx state machine to be idle */ so->tx.state = ISOTP_IDLE; hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | 
MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) return -EADDRNOTAVAIL; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) goto out_err; sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); msg->msg_namelen = ISOTP_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* set length of return value */ ret = (flags & MSG_TRUNC) ? skb->len : size; out_err: skb_free_datagram(sk, skb); return ret; } static int isotp_release(struct socket *sock) { struct sock *sk = sock->sk; struct isotp_sock *so; struct net *net; if (!sk) return 0; so = isotp_sk(sk); net = sock_net(sk); /* wait for complete transmission of current pdu */ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) ; /* force state machines to be idle also when a signal occurred */ so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&isotp_notifier_lock); } list_del(&so->notifier); spin_unlock(&isotp_notifier_lock); lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { if (isotp_register_rxid(so)) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } } } hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); so->ifindex = 0; so->bound = 0; if (so->rx.buf != so->rx.sbuf) kfree(so->rx.buf); if (so->tx.buf != so->tx.sbuf) kfree(so->tx.buf); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct net *net = sock_net(sk); int ifindex; struct net_device *dev; canid_t tx_id = addr->can_addr.tp.tx_id; canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else tx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (tx_id != addr->can_addr.tp.tx_id) return -EINVAL; /* sanitize rx CAN identifier (if needed) */ if (isotp_register_rxid(so)) { if (rx_id & CAN_EFF_FLAG) rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else rx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (rx_id != addr->can_addr.tp.rx_id) return -EINVAL; } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); if (so->bound) { err = -EINVAL; goto out; } /* ensure different CAN IDs when the rx_id is to be registered */ if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { dev_put(dev); err = -ENODEV; goto out; } if (dev->mtu < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = 
dev->ifindex; if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); /* no consecutive frame echo skb in flight */ so->cfecho = 0; /* register for echo skb's */ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; so->txid = tx_id; so->bound = 1; out: release_sock(sk); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, ISOTP_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = so->rxid; addr->can_addr.tp.tx_id = so->txid; return ISOTP_MIN_NAMELEN; } static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (so->bound) return -EISCONN; switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) return -EINVAL; if (copy_from_sockptr(&so->opt, optval, optlen)) return -EFAULT; /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; /* these broadcast flags are not allowed together */ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { /* CAN_ISOTP_SF_BROADCAST is prioritized */ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; /* give user feedback on wrong config attempt */ ret = -EINVAL; } /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) so->frame_txtime = 0; else so->frame_txtime = so->opt.frame_txtime; } break; case CAN_ISOTP_RECV_FC: if (optlen != sizeof(struct can_isotp_fc_options)) return -EINVAL; if (copy_from_sockptr(&so->rxfc, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_TX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_RX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_LL_OPTS: if (optlen == sizeof(struct can_isotp_ll_options)) { struct can_isotp_ll_options ll; if (copy_from_sockptr(&ll, optval, optlen)) return -EFAULT; /* check for correct ISO 11898-1 DLC data length */ if (ll.tx_dl != padlen(ll.tx_dl)) return -EINVAL; if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; if (ll.mtu == CAN_MTU && (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = ll.tx_dl; } else { return -EINVAL; } break; default: ret = -ENOPROTOOPT; } return ret; } static int isotp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int ret; if (level != SOL_CAN_ISOTP) return -EINVAL; lock_sock(sk); ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen); release_sock(sk); return ret; } static int isotp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; 
struct isotp_sock *so = isotp_sk(sk); int len; void *val; if (level != SOL_CAN_ISOTP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_ISOTP_OPTS: len = min_t(int, len, sizeof(struct can_isotp_options)); val = &so->opt; break; case CAN_ISOTP_RECV_FC: len = min_t(int, len, sizeof(struct can_isotp_fc_options)); val = &so->rxfc; break; case CAN_ISOTP_TX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_tx_stmin; break; case CAN_ISOTP_RX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_rx_stmin; break; case CAN_ISOTP_LL_OPTS: len = min_t(int, len, sizeof(struct can_isotp_ll_options)); val = &so->ll; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void isotp_notify(struct isotp_sock *so, unsigned long msg, struct net_device *dev) { struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (so->ifindex != dev->ifindex) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); } so->ifindex = 0; so->bound = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int isotp_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. 
*/ return NOTIFY_DONE; spin_lock(&isotp_notifier_lock); list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { spin_unlock(&isotp_notifier_lock); isotp_notify(isotp_busy_notifier, msg, dev); spin_lock(&isotp_notifier_lock); } isotp_busy_notifier = NULL; spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); so->ifindex = 0; so->bound = 0; so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = so->ll.tx_dl; so->rx.state = ISOTP_IDLE; so->tx.state = ISOTP_IDLE; so->rx.buf = so->rx.sbuf; so->tx.buf = so->tx.sbuf; so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); return 0; } static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp 
protocol (max_pdu_size %d)\n", max_pdu_size);

	err = can_proto_register(&isotp_can_proto);
	if (err < 0)
		pr_err("can: registration of isotp protocol failed %pe\n",
		       ERR_PTR(err));
	else
		register_netdevice_notifier(&canisotp_notifier);

	return err;
}

static __exit void isotp_module_exit(void)
{
	can_proto_unregister(&isotp_can_proto);
	unregister_netdevice_notifier(&canisotp_notifier);
}

module_init(isotp_module_init);
module_exit(isotp_module_exit);
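
/* Usage sketch (userspace, for illustration only - not built with this
 * module): the receive side complementing the send-side sketch earlier in
 * this file. Each successful read() returns one reassembled PDU; the RX
 * path problems listed in the header comment (timeout, SN mismatch, bad
 * padding) are reported as socket errors, so a subsequent read() fails
 * with the corresponding errno. Addressing mirrors the sender with
 * tx_id/rx_id swapped; "can0" and the CAN IDs remain arbitrary example
 * values.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <net/if.h>
 *	#include <sys/socket.h>
 *	#include <linux/can.h>
 *	#include <linux/can/isotp.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_can addr = { .can_family = AF_CAN };
 *		unsigned char pdu[8300];	// sized like DEFAULT_MAX_PDU_SIZE
 *		ssize_t n;
 *		int s;
 *
 *		s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);
 *		if (s < 0)
 *			return 1;
 *
 *		addr.can_ifindex = if_nametoindex("can0");
 *		addr.can_addr.tp.tx_id = 0x708;	// CAN ID we send FC frames on
 *		addr.can_addr.tp.rx_id = 0x700;	// CAN ID we receive PDUs on
 *		if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
 *			return 1;
 *
 *		n = read(s, pdu, sizeof(pdu));	// blocks until one full PDU arrived
 *		if (n < 0)
 *			perror("read");
 *		else
 *			printf("received %zd byte PDU\n", n);
 *
 *		close(s);
 *		return 0;
 *	}
 */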
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Neighbour Discovery for IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Mike Shaver		<shaver@ingenia.com>
 */

/*
 *	Changes:
 *
 *	Alexey I. Froloff		:	RFC6106 (DNSSL) support
 *	Pierre Ynard			:	export userland ND options
 *						through netlink (RDNSS support)
 *	Lars Fenneberg			:	fixed MTU setting on receipt
 *						of an RA.
 *	Janos Farkas			:	kmalloc failure checks
 *	Alexey Kuznetsov		:	state machine reworked
 *						and moved to net/core.
* Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 0, pad); opt += pad; space 
-= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64 || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. 
--ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } /* called with rtnl held */ static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); return false; } return true; } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", __func__); return NULL; } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ skb_set_owner_w(skb, sk); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int 
len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? idev->cnf.ndisc_tclass : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } EXPORT_SYMBOL(ndisc_send_skb); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; struct nd_msg *msg; int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, &tmpaddr)) return; src_addr = &tmpaddr; } if (!dev->addr_len) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, .icmp6_router = router, .icmp6_solicited = solicited, .icmp6_override = override, }, .target = *solicited_addr, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; idev = in6_dev_get(dev); if (!idev) return; read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { /* skip tentative addresses until dad completes */ if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_OPTIMISTIC)) continue; ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); } read_unlock_bh(&idev->lock); in6_dev_put(idev); } struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce) { int inc_opt = dev->addr_len; struct sk_buff *skb; 
struct nd_msg *msg; int optlen = 0; if (!saddr) return NULL; if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return NULL; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, }, .target = *solicit, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) { u8 *opt = skb_put(skb, 8); opt[0] = ND_OPT_NONCE; opt[1] = 8 >> 3; memcpy(opt + 2, &nonce, 6); } return skb; } EXPORT_SYMBOL(ndisc_ns_create); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce) { struct in6_addr addr_buf; struct sk_buff *skb; if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } skb = ndisc_ns_create(dev, solicit, saddr, nonce); if (skb) ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct sk_buff *skb; struct rs_msg *msg; int send_sllao = dev->addr_len; int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * According to section 2.2 of RFC 4429, we must not * send router solicitations with a sllao from * optimistic addresses, but we may send the solicitation * if we don't include the sllao. So here we check * if our address is optimistic, and if so, we * suppress the inclusion of the sllao. */ if (send_sllao) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; } in6_ifa_put(ifp); } else { send_sllao = 0; } } #endif if (send_sllao) optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct rs_msg) { .icmph = { .icmp6_type = NDISC_ROUTER_SOLICITATION, }, }; if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { /* * "The sender MUST return an ICMP * destination unreachable" */ dst_link_failure(skb); kfree_skb(skb); } /* Called with locked neigh: either read or both */ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct net_device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { ND_PRINTK(1, dbg, "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } static int pndisc_is_router(const void *pkey, struct net_device *dev) { struct pneigh_entry *n; int ret = -1; read_lock_bh(&nd_tbl.lock); n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, 
dev); if (n) ret = !!(n->flags & NTF_ROUTER); read_unlock_bh(&nd_tbl.lock); return ret; } void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); int is_router = -1; SKB_DR(reason); u64 nonce = 0; bool inc; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return reason; } /* * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return reason; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); return reason; } /* RFC2461 7.1.1: * If the IP source address is the unspecified address, * there MUST NOT be source link-layer address option * in the message. */ if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ ND_PRINTK(2, notice, "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", ifp->idev->dev->name, &ifp->addr, np); goto out; } /* * We are colliding with another node * who is doing DAD * so fail our DAD process */ addrconf_dad_failure(skb, ifp); return reason; } else { /* * This is not a dad solicitation. * If we are an optimistic node, * we should respond. * Otherwise, we should ignore it. */ if (!(ifp->flags & IFA_F_OPTIMISTIC)) goto out; } } idev = ifp->idev; } else { struct net *net = dev_net(dev); /* perhaps an address on the master device */ if (netif_is_l3_slave(dev)) { struct net_device *mdev; mdev = netdev_master_upper_dev_get_rcu(dev); if (mdev) { ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); if (ifp) goto have_ifp; } } idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? 
*/ return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || (idev->cnf.forwarding && (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc && NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response * by a random time between 0 and * MAX_ANYCAST_DELAY_TIME seconds. * (RFC2461) -- yoshfuji */ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); if (n) pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } } else { SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; } } if (is_router < 0) is_router = idev->cnf.forwarding; if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } if (inc) NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); else NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); /* * update / create cache entry * for the source address */ neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE, NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); reason = SKB_CONSUMED; } out: if (ifp) in6_ifa_put(ifp); else in6_dev_put(idev); return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) { struct inet6_dev *idev = __in6_dev_get(dev); switch (idev->cnf.accept_untracked_na) { case 0: /* Don't accept untracked na (absent in neighbor cache) */ return 0; case 1: /* Create new entries from na if currently untracked */ return 1; case 2: /* Create new entries from untracked na only if saddr is in the * same subnet as an address configured on the interface that * received the na */ return !!ipv6_chk_prefix(saddr, dev); default: return 0; } } static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; SKB_DR(reason); u8 new_state; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return reason; } /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. 
* drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) return reason; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); return reason; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) We should not print the error if NA has been received from loopback - it is just our own unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); /* RFC 9131 updates original Neighbour Discovery RFC 4861. * NAs with Target LL Address option without a corresponding * entry in the neighbour cache can now create a STALE neighbour * cache entry on routers. * * entry accept fwding solicited behaviour * ------- ------ ------ --------- ---------------------- * present X X 0 Set state to STALE * present X X 1 Set state to REACHABLE * absent 0 X X Do nothing * absent 1 0 X Do nothing * absent 1 1 X Add a new STALE entry * * Note that we don't do a (daddr == all-routers-mcast) check. */ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; if (!neigh && lladdr && idev && idev->cnf.forwarding) { if (accept_untracked_na(dev, saddr)) { neigh = neigh_create(&nd_tbl, &msg->target, dev); new_state = NUD_STALE; } } if (neigh && !IS_ERR(neigh)) { u8 old_flags = neigh->flags; struct net *net = dev_net(dev); if (READ_ONCE(neigh->nud_state) & NUD_FAILED) goto out; /* * Don't update the neighbor cache entry on a proxy NA from * ourselves because either the proxied node is off link or it * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } ndisc_update(dev, neigh, lladdr, new_state, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| (msg->icmph.icmp6_router ? 
NEIGH_UPDATE_F_ISROUTER : 0), NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* * Change: router to host */ rt6_clean_tohost(dev_net(dev), saddr); } reason = SKB_CONSUMED; out: neigh_release(neigh); } return reason; } static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); return reason; } /* Don't accept RS if we're not in router mode */ if (!idev->cnf.forwarding) goto out; /* * Don't update NCE if src = ::; * this implies that the source node has no ip address assigned yet. */ if (ipv6_addr_any(saddr)) goto out; /* Parse ND options */ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) goto out; } neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); reason = SKB_CONSUMED; } out: return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) { struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); struct sk_buff *skb; struct nlmsghdr *nlh; struct nduseroptmsg *ndmsg; struct net *net = dev_net(ra->dev); int err; int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + (opt->nd_opt_len << 3)); size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); skb = nlmsg_new(msg_size, GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); if (!nlh) { goto nla_put_failure; } ndmsg = nlmsg_data(nlh); ndmsg->nduseropt_family = AF_INET6; ndmsg->nduseropt_ifindex = ra->dev->ifindex; ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; struct net *net; SKB_DR(reason); int lifetime; int optlen; __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); ND_PRINTK(2, info, "RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return reason; } if (optlen < 0) 
return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); return reason; } #endif in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); return reason; } if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #endif if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent * out on this interface. */ in6_dev->if_flags |= IF_RA_RCVD; } /* * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? IF_RA_MANAGED : 0) | (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", __func__, skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { ND_PRINTK(2, info, "RA: router lifetime (%ds) is too short: %s\n", lifetime, skb->dev->name); goto skip_defrtr; } /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. 
*/ net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); goto skip_defrtr; } #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } } /* Set default route metric as specified by user */ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric; /* delete the route if lifetime is 0 or if metric needs change */ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) { ip6_del_rt(net, rt, false); rt = NULL; } ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); if (neigh) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev, pref, defrtr_usr_metric); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { struct nl_info nlinfo = { .nl_net = net, }; rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } if (rt) fib6_set_expires(rt, jiffies + (HZ * lifetime)); if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } } skip_defrtr: /* * Update Reachable Time and Retrans Timer */ if (in6_dev->nd_parms) { unsigned long rtime = ntohl(ra_msg->retrans_timer); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; if (rtime < HZ/100) rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { NEIGH_VAR_SET(in6_dev->nd_parms, BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } } } skip_linkparms: /* * Process options. 
*/ if (!neigh) neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, skb->dev, 1); if (neigh) { u8 *lladdr = NULL; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { ND_PRINTK(2, warn, "RA: invalid link-layer address length\n"); goto out; } } ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, accept_ra is false for dev: %s\n", __func__, skb->dev->name); goto out; } #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); goto skip_routeinfo; } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { struct route_info *ri = (struct route_info *)p; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && ri->prefix_len == 0) continue; #endif if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; if (ri->lifetime != 0 && ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } skip_routeinfo: #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", __func__, skb->dev->name); goto out; } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { addrconf_prefix_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, ndopts.nd_opts_src_lladdr != NULL); } } if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { __be32 n; u32 mtu; memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (in6_dev->ra_mtu != mtu) { in6_dev->ra_mtu = mtu; send_ifinfo_notify = true; } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } if (ndopts.nd_useropts) { struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; p = ndisc_next_useropt(skb->dev, p, ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or * timer settings or ra_mtu value */ if (send_ifinfo_notify) inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); fib6_info_release(rt); if (neigh) neigh_release(neigh); return reason; } static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); struct ndisc_options ndopts; SKB_DR(reason); u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch 
(skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return SKB_DROP_REASON_PKT_TOO_SMALL; return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, struct sk_buff *orig_skb, int rd_len) { u8 *opt = skb_put(skb, rd_len); memset(opt, 0, 8); *(opt++) = ND_OPT_REDIRECT_HDR; *(opt++) = (rd_len >> 3); opt += 6; skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, rd_len - 8); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; struct rd_msg *msg; struct in6_addr saddr_buf; struct rt6_info *rt; struct dst_entry *dst; struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: target address is not link-local unicast\n"); return; } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, "Redirect: destination is not a neighbour\n"); goto release; } peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { ND_PRINTK(2, warn, "Redirect: no neigh for target address\n"); goto release; } read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; optlen += ndisc_redirect_opt_addr_space(dev, neigh, ops_data_buf, &ops_data); } else read_unlock_bh(&neigh->lock); neigh_release(neigh); } rd_len = min_t(unsigned int, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, skb->len + 8); rd_len &= ~0x7; optlen += rd_len; buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!buff) goto release; msg = skb_put(buff, sizeof(*msg)); *msg = (struct rd_msg) { .icmph = { .icmp6_type = NDISC_REDIRECT, }, .target = *target, .dest = ipv6_hdr(skb)->daddr, }; /* * include target_address option */ if (ha) ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new 
packet. */ if (rd_len) ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) { enum skb_drop_reason reason = ndisc_recv_ns(skb); kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) { return ipv6_addr_is_multicast((struct in6_addr *)pkey); } static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && idev->cnf.suppress_frag_ndisc) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: reason = ndisc_redirect_rcv(skb); break; } return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); fallthrough; case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; if (idev->cnf.ndisc_notify || net->ipv6.devconf_all->ndisc_notify) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; case NETDEV_CHANGE: idev = in6_dev_get(dev); if (!idev) evict_nocarrier = true; else { evict_nocarrier = idev->cnf.ndisc_evict_nocarrier && net->ipv6.devconf_all->ndisc_evict_nocarrier; in6_dev_put(idev); } change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); break; case NETDEV_NOTIFY_PEERS: ndisc_send_unsol_na(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms 
instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); warned++; } } int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; int ret; if ((strcmp(ctl->procname, "retrans_time") == 0) || (strcmp(ctl->procname, "base_reachable_time") == 0)) ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } return ret; } #endif static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { ND_PRINTK(0, err, "NDISC: Failed to initialize the control socket (err %d)\n", err); return err; } net->ipv6.ndisc_sk = sk; np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ np->mc_loop = 0; return 0; } static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, }; int __init ndisc_init(void) { int err; err = register_pernet_subsys(&ndisc_net_ops); if (err) return err; /* * Initialize the neighbour table */ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: #endif return err; #ifdef CONFIG_SYSCTL out_unregister_pernet: unregister_pernet_subsys(&ndisc_net_ops); goto out; #endif } int __init ndisc_late_init(void) { return register_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_cleanup(void) { #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); } |
739 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; dscp_t fa_dscp; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */ |
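/*
 * Illustrative sketch, not from the kernel tree: the "test before
 * write" idiom behind fib_alias_accessed() above. Unconditionally
 * setting an already-set flag would still dirty the cache line and
 * force other CPUs that share it read-only to re-fetch it; checking
 * first keeps the common case free of stores. The type and flag names
 * below are made up for the example.
 */
struct toy_entry {
	unsigned char state;
};

#define TOY_ACCESSED 0x01

static inline void toy_mark_accessed(struct toy_entry *e)
{
	/* skip the store (and the cache-line invalidation) if already set */
	if (!(e->state & TOY_ACCESSED))
		e->state |= TOY_ACCESSED;
}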
319 1 320 3304 1439 1439 1156 556 2939 1491 18 1323 546 320 45 2249 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ #ifndef _EXT4_EXTENTS #define _EXT4_EXTENTS #include "ext4.h" /* * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks * becomes very small, so index split, in-depth growing and * other hard changes happen much more often. * This is for debug purposes only. */ #define AGGRESSIVE_TEST_ /* * With EXTENTS_STATS defined, the number of blocks and extents * are collected in the truncate path. They'll be shown at * umount time. */ #define EXTENTS_STATS__ /* * If CHECK_BINSEARCH is defined, then the results of the binary search * will also be checked by linear search. */ #define CHECK_BINSEARCH__ /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ #define EXT_STATS_ /* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. * For non-inode extent blocks, ext4_extent_tail * follows the array. */ /* * This is the extent tail on-disk structure. * All other extent structures are 12 bytes long. It turns out that * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which * covers all valid ext4 block sizes. Therefore, this tail structure can be * crammed into the end of the block without having to rebalance the tree. */ struct ext4_extent_tail { __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ }; /* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ struct ext4_extent { __le32 ee_block; /* first logical block extent covers */ __le16 ee_len; /* number of blocks covered by extent */ __le16 ee_start_hi; /* high 16 bits of physical block */ __le32 ee_start_lo; /* low 32 bits of physical block */ }; /* * This is index on-disk structure. * It's used at all the levels except the bottom. */ struct ext4_extent_idx { __le32 ei_block; /* index covers logical blocks from 'block' */ __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ __le16 ei_leaf_hi; /* high 16 bits of physical block */ __u16 ei_unused; }; /* * Each block (leaves and indexes), even inode-stored has header. 
*/ struct ext4_extent_header { __le16 eh_magic; /* probably will support different formats */ __le16 eh_entries; /* number of valid entries */ __le16 eh_max; /* capacity of store in entries */ __le16 eh_depth; /* has tree real underlying blocks? */ __le32 eh_generation; /* generation of the tree */ }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) #define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) static inline struct ext4_extent_tail * find_ext4_extent_tail(struct ext4_extent_header *eh) { return (struct ext4_extent_tail *)(((void *)eh) + EXT4_EXTENT_TAIL_OFFSET(eh)); } /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. * Truncate uses it to simulate recursive walking. */ struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; struct buffer_head *p_bh; }; /* * Used to record a portion of a cluster found at the beginning or end * of an extent while traversing the extent tree during space removal. * A partial cluster may be removed if it does not contain blocks shared * with extents that aren't being deleted (tofree state). Otherwise, * it cannot be removed (nofree state). */ struct partial_cluster { ext4_fsblk_t pclu; /* physical cluster number */ ext4_lblk_t lblk; /* logical block number within logical cluster */ enum {initial, tofree, nofree} state; }; /* * structure for external API */ /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this * particular extent is an initialized extent or an unwritten (i.e. * preallocated). * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an * unwritten one. In other words, if MSB of ee_len is set, it is an * unwritten extent with only one special scenario when ee_len = 0x8000. * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) #define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ ((struct ext4_extent *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_FIRST_INDEX(__hdr__) \ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_HAS_FREE_INDEX(__path__) \ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ < le16_to_cpu((__path__)->p_hdr->eh_max)) #define EXT_LAST_EXTENT(__hdr__) \ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? 
\ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { return (struct ext4_extent_header *) EXT4_I(inode)->i_data; } static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) { return (struct ext4_extent_header *) bh->b_data; } static inline unsigned short ext_depth(struct inode *inode) { return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); } static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) { return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? le16_to_cpu(ext->ee_len) : (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) { ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } /* * ext4_ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; block = le32_to_cpu(ex->ee_start_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; return block; } /* * ext4_idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) { ext4_fsblk_t block; block = le32_to_cpu(ix->ei_leaf_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; return block; } /* * ext4_ext_store_pblock: * stores a large physical block number into an extent struct, * breaking it into parts */ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) { ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } /* * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts */ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) { ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } #endif /* _EXT4_EXTENTS */ |
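/*
 * Illustrative sketch, not part of ext4: the two encodings the helpers
 * above implement, using plain host-endian integers instead of the
 * on-disk __le16/__le32 fields.
 *
 * 1) A 48-bit physical block number is split into a 32-bit low part
 *    (ee_start_lo) and a 16-bit high part (ee_start_hi).
 * 2) ee_len uses its MSB to mark unwritten extents: values up to
 *    0x8000 (EXT_INIT_MAX_LEN) are initialized extents, larger values
 *    are unwritten extents whose real length is ee_len - 0x8000.
 */
#include <stdint.h>

struct toy_extent {
	uint32_t start_lo;	/* like ee_start_lo */
	uint16_t start_hi;	/* like ee_start_hi */
	uint16_t len;		/* like ee_len */
};

static uint64_t toy_ext_pblock(const struct toy_extent *ex)
{
	return (uint64_t)ex->start_lo | ((uint64_t)ex->start_hi << 32);
}

static void toy_ext_store_pblock(struct toy_extent *ex, uint64_t pb)
{
	ex->start_lo = (uint32_t)(pb & 0xffffffff);
	ex->start_hi = (uint16_t)((pb >> 32) & 0xffff);
}

static int toy_ext_is_unwritten(const struct toy_extent *ex)
{
	return ex->len > 0x8000;	/* MSB set and not exactly 0x8000 */
}

static unsigned int toy_ext_actual_len(const struct toy_extent *ex)
{
	return toy_ext_is_unwritten(ex) ? ex->len - 0x8000 : ex->len;
}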
314 469 469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { struct address_space *mapping = inode->i_mapping; /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ filemap_invalidate_lock(mapping); truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); filemap_invalidate_unlock(mapping); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } |
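/*
 * Illustrative sketch, not from the kernel tree: the clamping that
 * ext4_blocks_for_truncate() performs above, with made-up constants so
 * it compiles on its own. i_blocks counts 512-byte sectors, so shifting
 * by (blocksize_bits - 9) converts it to filesystem blocks (e.g. >> 3
 * for a 4 KiB block size).
 */
#include <stdio.h>

#define TOY_MAX_TRANS_DATA	64	/* stands in for EXT4_MAX_TRANS_DATA */
#define TOY_DATA_TRANS_BLOCKS	20	/* stands in for EXT4_DATA_TRANS_BLOCKS() */

static unsigned long toy_blocks_for_truncate(unsigned long long i_blocks,
					     unsigned int blocksize_bits)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)				/* guard against corrupt i_blocks */
		needed = 2;
	if (needed > TOY_MAX_TRANS_DATA)	/* bound the transaction size */
		needed = TOY_MAX_TRANS_DATA;

	return TOY_DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	/* 1 MiB file, 4 KiB blocks: 2048 sectors -> 256 blocks, clamped to 64 -> 84 */
	printf("%lu\n", toy_blocks_for_truncate(2048, 12));
	return 0;
}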
1218 657 657 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; } |
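/*
 * Illustrative sketch, not part of the x86 decoder: the lookup pattern
 * shared by inat_get_escape_attribute() and inat_get_group_attribute()
 * above. Each escape/group id selects a row of per-prefix tables;
 * column 0 is the "no mandatory prefix" table, and a prefix-specific
 * column is consulted only when the column-0 entry is marked as having
 * variants. The names and the HAS_VARIANT flag here are made up.
 */
#include <stdint.h>

#define TOY_HAS_VARIANT	0x8000u
#define TOY_NPREFIX	4	/* none, 0x66, 0xf2, 0xf3 */

typedef uint16_t toy_attr_t;

static toy_attr_t toy_lookup(const toy_attr_t *tables[TOY_NPREFIX],
			     uint8_t opcode, int lpfx_id)
{
	const toy_attr_t *table = tables[0];

	if (!table)
		return 0;
	if ((table[opcode] & TOY_HAS_VARIANT) && lpfx_id) {
		table = tables[lpfx_id];	/* switch to the prefix variant */
		if (!table)
			return 0;
	}
	return table[opcode];
}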
3176 3176 5423 263 3176 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. 
 *
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)		\
	for (pos = hlist_bl_first_rcu(head);				\
		pos &&							\
		({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
		pos = rcu_dereference_raw(pos->next))

#endif
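/*
 * Hedged usage sketch (hypothetical, not part of this header): a writer that
 * inserts under the head's embedded bit spinlock and a lock-free reader that
 * walks the same chain under rcu_read_lock(). struct demo_entry and the
 * demo_* functions are invented for illustration; removal would pair
 * hlist_bl_del_rcu() with kfree_rcu() or a synchronize_rcu() grace period.
 */
#include <linux/list_bl.h>
#include <linux/rculist_bl.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct demo_entry {
	int key;
	struct hlist_bl_node node;
};

static struct hlist_bl_head demo_head;	/* zeroed => empty list, unlocked */

/* writer side: serialized by the bit spinlock embedded in demo_head.first */
static int demo_add(int key)
{
	struct demo_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);

	if (!e)
		return -ENOMEM;
	e->key = key;
	INIT_HLIST_BL_NODE(&e->node);

	hlist_bl_lock(&demo_head);
	hlist_bl_add_head_rcu(&e->node, &demo_head);
	hlist_bl_unlock(&demo_head);
	return 0;
}

/* reader side: no lock taken, only an RCU read-side critical section */
static bool demo_lookup(int key)
{
	struct demo_entry *e;
	struct hlist_bl_node *pos;
	bool found = false;

	rcu_read_lock();
	hlist_bl_for_each_entry_rcu(e, pos, &demo_head, node) {
		if (e->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}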
1226 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SYNC_CORE_H #define _ASM_X86_SYNC_CORE_H #include <linux/preempt.h> #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> #ifdef CONFIG_X86_32 static inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" "pushl %%cs\n\t" "pushl $1f\n\t" "iret\n\t" "1:" : ASM_CALL_CONSTRAINT : : "memory"); } #else static inline void iret_to_self(void) { unsigned int tmp; asm volatile ( "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" "addq $8, (%%rsp)\n\t" "pushfq\n\t" "mov %%cs, %0\n\t" "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" "1:" : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); } #endif /* CONFIG_X86_32 */ /* * This function forces the icache and prefetched instruction stream to * catch up with reality in two very specific cases: * * a) Text was modified using one virtual address and is about to be executed * from the same physical page at a different virtual address. * * b) Text was modified on a different CPU, may subsequently be * executed on this CPU, and you want to make sure the new version * gets executed. This generally means you're calling this in an IPI. * * If you're calling this for a different reason, you're probably doing * it wrong. * * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ static inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to * do this, but it is not universally available. */ if (static_cpu_has(X86_FEATURE_SERIALIZE)) { serialize(); return; } /* * For all other processors, there are quite a few ways to do this. * IRET-to-self is nice because it works on every CPU, at any CPL * (so it's compatible with paravirtualization), and it never exits * to a hypervisor. The only downsides are that it's a bit slow * (it seems to be a bit more than 2x slower than the fastest * options) and that it unmasks NMIs. The "push %cs" is needed, * because in paravirtual environments __KERNEL_CS may not be a * valid CS value when we do IRET directly. * * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a * hypervisor. */ iret_to_self(); } /* * Ensure that a core serializing instruction is issued before returning * to user-mode. x86 implements return to user-space through sysexit, * sysrel, and sysretq, which are not core serializing. */ static inline void sync_core_before_usermode(void) { /* With PTI, we unconditionally serialize before running user code. */ if (static_cpu_has(X86_FEATURE_PTI)) return; /* * Even if we're in an interrupt, we might reschedule before returning, * in which case we could switch to a different thread in the same mm * and return using SYSRET or SYSEXIT. Instead of trying to keep * track of our need to sync the core, just sync right away. */ sync_core(); } #endif /* _ASM_X86_SYNC_CORE_H */ |
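/*
 * Hedged usage sketch (hypothetical): after cross-modifying kernel text on
 * one CPU, make every CPU serialize its instruction stream before it may
 * execute the new bytes. The demo_* names are illustrative only; real text
 * patching in the kernel goes through the text_poke machinery rather than
 * this bare helper.
 */
#include <linux/smp.h>
#include <asm/sync_core.h>

static void demo_do_sync_core(void *info)
{
	sync_core();	/* runs in IPI context on each CPU */
}

static void demo_sync_all_cpus(void)
{
	/* wait=1: return only after every CPU has run the callback */
	on_each_cpu(demo_do_sync_core, NULL, 1);
}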
// SPDX-License-Identifier: GPL-2.0
/*
 * Some IBSS support code for cfg80211.
* * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2022 Intel Corporation */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/export.h> #include <net/cfg80211.h> #include "wext-compat.h" #include "nl80211.h" #include "rdev-ops.h" void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_bss *bss; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return; if (!wdev->u.ibss.ssid_len) return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); wdev->u.ibss.current_bss = bss_from_pub(bss); cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); #endif } void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_ibss_joined(dev, bssid, channel); if (WARN_ON(!channel)) return; ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_IBSS_JOINED; memcpy(ev->ij.bssid, bssid, ETH_ALEN); ev->ij.channel = channel; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_ibss_joined); int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); if (wdev->u.ibss.ssid_len) return -EALREADY; if (!params->basic_rates) { /* * If no rates were explicitly configured, * use the mandatory rate set for 11b or * 11a for maximum compatibility. 
*/ struct ieee80211_supported_band *sband; enum nl80211_band band; u32 flag; int j; band = params->chandef.chan->band; if (band == NL80211_BAND_5GHZ || band == NL80211_BAND_6GHZ) flag = IEEE80211_RATE_MANDATORY_A; else flag = IEEE80211_RATE_MANDATORY_B; sband = rdev->wiphy.bands[band]; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].flags & flag) params->basic_rates |= BIT(j); } } if (WARN_ON(connkeys && connkeys->def < 0)) return -EINVAL; if (WARN_ON(wdev->connect_keys)) kfree_sensitive(wdev->connect_keys); wdev->connect_keys = connkeys; wdev->u.ibss.chandef = params->chandef; if (connkeys) { params->wep_keys = connkeys->params; params->wep_tx_key = connkeys->def; } #ifdef CONFIG_CFG80211_WEXT wdev->wext.ibss.chandef = params->chandef; #endif err = rdev_join_ibss(rdev, dev, params); if (err) { wdev->connect_keys = NULL; return err; } memcpy(wdev->u.ibss.ssid, params->ssid, params->ssid_len); wdev->u.ibss.ssid_len = params->ssid_len; return 0; } static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; ASSERT_WDEV_LOCK(wdev); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; rdev_set_qos_map(rdev, dev, NULL); /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } wdev->u.ibss.current_bss = NULL; wdev->u.ibss.ssid_len = 0; memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; #endif cfg80211_sched_dfs_chan_update(rdev); } void cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; wdev_lock(wdev); __cfg80211_clear_ibss(dev, nowext); wdev_unlock(wdev); } int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; ASSERT_WDEV_LOCK(wdev); if (!wdev->u.ibss.ssid_len) return -ENOLINK; err = rdev_leave_ibss(rdev, dev); if (err) return err; wdev->conn_owner_nlportid = 0; __cfg80211_clear_ibss(dev, nowext); return 0; } int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; wdev_lock(wdev); err = __cfg80211_leave_ibss(rdev, dev, nowext); wdev_unlock(wdev); return err; } #ifdef CONFIG_CFG80211_WEXT int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; enum nl80211_band band; int i, err; ASSERT_WDEV_LOCK(wdev); if (!wdev->wext.ibss.beacon_interval) wdev->wext.ibss.beacon_interval = 100; /* try to find an IBSS channel if none requested ... 
*/ if (!wdev->wext.ibss.chandef.chan) { struct ieee80211_channel *new_chan = NULL; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; sband = rdev->wiphy.bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; if (chan->flags & IEEE80211_CHAN_NO_IR) continue; if (chan->flags & IEEE80211_CHAN_DISABLED) continue; new_chan = chan; break; } if (new_chan) break; } if (!new_chan) return -EINVAL; cfg80211_chandef_create(&wdev->wext.ibss.chandef, new_chan, NL80211_CHAN_NO_HT); } /* don't join -- SSID is not there */ if (!wdev->wext.ibss.ssid_len) return 0; if (!netif_running(wdev->netdev)) return 0; if (wdev->wext.keys) wdev->wext.keys->def = wdev->wext.default_key; wdev->wext.ibss.privacy = wdev->wext.default_key != -1; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, &wdev->wext.ibss, ck); if (err) kfree(ck); return err; } int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_NO_IR || chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->wext.ibss.chandef.chan == chan) return 0; wdev_lock(wdev); err = 0; if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); if (err) return err; if (chan) { cfg80211_chandef_create(&wdev->wext.ibss.chandef, chan, NL80211_CHAN_NO_HT); wdev->wext.ibss.channel_fixed = true; } else { /* cfg80211_ibss_wext_join will pick one if needed */ wdev->wext.ibss.channel_fixed = false; } wdev_lock(wdev); err = cfg80211_ibss_wext_join(rdev, wdev); wdev_unlock(wdev); return err; } int cfg80211_ibss_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; wdev_lock(wdev); if (wdev->u.ibss.current_bss) chan = wdev->u.ibss.current_bss->pub.channel; else if (wdev->wext.ibss.chandef.chan) chan = wdev->wext.ibss.chandef.chan; wdev_unlock(wdev); if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_ibss_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; wdev_lock(wdev); err = 0; if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); if (err) return err; /* iwconfig uses nul termination in SSID.. 
*/ if (len > 0 && ssid[len - 1] == '\0') len--; memcpy(wdev->u.ibss.ssid, ssid, len); wdev->wext.ibss.ssid = wdev->u.ibss.ssid; wdev->wext.ibss.ssid_len = len; wdev_lock(wdev); err = cfg80211_ibss_wext_join(rdev, wdev); wdev_unlock(wdev); return err; } int cfg80211_ibss_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; data->flags = 0; wdev_lock(wdev); if (wdev->u.ibss.ssid_len) { data->flags = 1; data->length = wdev->u.ibss.ssid_len; memcpy(ssid, wdev->u.ibss.ssid, data->length); } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { data->flags = 1; data->length = wdev->wext.ibss.ssid_len; memcpy(ssid, wdev->wext.ibss.ssid, data->length); } wdev_unlock(wdev); return 0; } int cfg80211_ibss_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (bssid && !is_valid_ether_addr(bssid)) return -EINVAL; /* both automatic */ if (!bssid && !wdev->wext.ibss.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.ibss.bssid && bssid && ether_addr_equal(bssid, wdev->wext.ibss.bssid)) return 0; wdev_lock(wdev); err = 0; if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); if (err) return err; if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.ibss.bssid = wdev->wext.bssid; } else wdev->wext.ibss.bssid = NULL; wdev_lock(wdev); err = cfg80211_ibss_wext_join(rdev, wdev); wdev_unlock(wdev); return err; } int cfg80211_ibss_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; wdev_lock(wdev); if (wdev->u.ibss.current_bss) memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid, ETH_ALEN); else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); wdev_unlock(wdev); return 0; } #endif |
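/*
 * Hedged driver-side sketch (hypothetical): how a wireless driver might
 * report a completed IBSS join back to cfg80211 from its own completion
 * handler. struct demo_vif and its fields are invented for illustration;
 * only cfg80211_ibss_joined() comes from the code above.
 */
#include <net/cfg80211.h>
#include <linux/if_ether.h>

struct demo_vif {
	struct net_device *ndev;
	struct ieee80211_channel *oper_chan;
	u8 bssid[ETH_ALEN];
};

static void demo_report_ibss_join(struct demo_vif *vif)
{
	/*
	 * Queues EVENT_IBSS_JOINED; cfg80211 later resolves the BSS entry
	 * in __cfg80211_ibss_joined() and notifies nl80211 (and wext).
	 */
	cfg80211_ibss_joined(vif->ndev, vif->bssid, vif->oper_chan,
			     GFP_KERNEL);
}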
2 62 62 1 1 62 62 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 | // SPDX-License-Identifier: GPL-2.0-only /* * vxcan.c - Virtual CAN Tunnel for cross namespace communication * * This code is derived from drivers/net/can/vcan.c for the virtual CAN * specific parts and from drivers/net/veth.c to implement the netlink API * for network interface pairs in a common and established way. * * Copyright (c) 2017 Oliver Hartkopp <socketcan@hartkopp.net> */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/can/vxcan.h> #include <linux/can/can-ml.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vxcan" MODULE_DESCRIPTION("Virtual CAN Tunnel"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); struct vxcan_priv { struct net_device __rcu *peer; }; static netdev_tx_t vxcan_xmit(struct sk_buff *oskb, struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; struct net_device_stats *peerstats, *srcstats = &dev->stats; struct sk_buff *skb; unsigned int len; if (can_dropped_invalid_skb(dev, oskb)) return NETDEV_TX_OK; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) { kfree_skb(oskb); dev->stats.tx_dropped++; goto out_unlock; } skb_tx_timestamp(oskb); skb = skb_clone(oskb, GFP_ATOMIC); if (skb) { consume_skb(oskb); } else { kfree_skb(oskb); goto out_unlock; } /* reset CAN GW hop counter */ skb->csum_start = 0; skb->pkt_type = PACKET_BROADCAST; skb->dev = peer; skb->ip_summed = CHECKSUM_UNNECESSARY; len = can_skb_get_data_len(skb); if (netif_rx(skb) == NET_RX_SUCCESS) { srcstats->tx_packets++; srcstats->tx_bytes += len; peerstats = &peer->stats; peerstats->rx_packets++; peerstats->rx_bytes += len; } out_unlock: rcu_read_unlock(); return NETDEV_TX_OK; } static int vxcan_open(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } return 0; } static int vxcan_close(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = 
rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); return 0; } static int vxcan_get_iflink(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ? peer->ifindex : 0; rcu_read_unlock(); return iflink; } static int vxcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops vxcan_netdev_ops = { .ndo_open = vxcan_open, .ndo_stop = vxcan_close, .ndo_start_xmit = vxcan_xmit, .ndo_get_iflink = vxcan_get_iflink, .ndo_change_mtu = vxcan_change_mtu, }; static const struct ethtool_ops vxcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vxcan_setup(struct net_device *dev) { struct can_ml_priv *can_ml; dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; dev->netdev_ops = &vxcan_netdev_ops; dev->ethtool_ops = &vxcan_ethtool_ops; dev->needs_free_netdev = true; can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); } /* forward declaration for rtnl_create_link() */ static struct rtnl_link_ops vxcan_link_ops; static int vxcan_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxcan_priv *priv; struct net_device *peer; struct net *peer_net; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb; char ifname[IFNAMSIZ]; unsigned char name_assign_type; struct ifinfomsg *ifmp = NULL; int err; /* register peer device */ if (data && data[VXCAN_INFO_PEER]) { struct nlattr *nla_peer; nla_peer = data[VXCAN_INFO_PEER]; ifmp = nla_data(nla_peer); err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); if (err < 0) return err; tbp = peer_tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer_net = rtnl_link_get_net(net, tbp); if (IS_ERR(peer_net)) return PTR_ERR(peer_net); peer = rtnl_create_link(peer_net, ifname, name_assign_type, &vxcan_link_ops, tbp, extack); if (IS_ERR(peer)) { put_net(peer_net); return PTR_ERR(peer); } if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; err = register_netdevice(peer); put_net(peer_net); peer_net = NULL; if (err < 0) { free_netdev(peer); return err; } netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; /* register first device */ if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto unregister_network_device; netif_carrier_off(dev); /* cross link the device pair */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); return 0; unregister_network_device: unregister_netdevice(peer); return err; } static void vxcan_dellink(struct net_device *dev, struct list_head *head) { struct vxcan_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from 
default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. */ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy vxcan_policy[VXCAN_INFO_MAX + 1] = { [VXCAN_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *vxcan_get_link_net(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static struct rtnl_link_ops vxcan_link_ops = { .kind = DRV_NAME, .priv_size = ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN) + sizeof(struct can_ml_priv), .setup = vxcan_setup, .newlink = vxcan_newlink, .dellink = vxcan_dellink, .policy = vxcan_policy, .maxtype = VXCAN_INFO_MAX, .get_link_net = vxcan_get_link_net, }; static __init int vxcan_init(void) { pr_info("vxcan: Virtual CAN Tunnel driver\n"); return rtnl_link_register(&vxcan_link_ops); } static __exit void vxcan_exit(void) { rtnl_link_unregister(&vxcan_link_ops); } module_init(vxcan_init); module_exit(vxcan_exit); |
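/*
 * Hedged user-space usage sketch: sending one classic CAN frame into a vxcan
 * pair that is assumed to have been created beforehand, e.g. with
 * "ip link add vxcan0 type vxcan peer name vxcan1". The interface names
 * vxcan0/vxcan1 are assumptions of this example; a frame written on one end
 * pops out on the peer, possibly in another network namespace.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/raw.h>

int main(void)
{
	struct sockaddr_can addr = { .can_family = AF_CAN };
	struct can_frame frame = { .can_id = 0x123 };
	int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);

	if (s < 0) {
		perror("socket");
		return 1;
	}

	frame.can_dlc = 2;	/* payload length; newer headers call this .len */
	frame.data[0] = 0xde;
	frame.data[1] = 0xad;

	addr.can_ifindex = if_nametoindex("vxcan0");
	if (!addr.can_ifindex ||
	    bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind vxcan0");
		return 1;
	}

	if (write(s, &frame, sizeof(frame)) != sizeof(frame)) {
		perror("write");
		return 1;
	}

	close(s);
	return 0;
}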
1497 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | // SPDX-License-Identifier: GPL-2.0-or-later /* * "TEE" target extension for Xtables * Copyright © Sebastian Claßen, 2007 * Jan Engelhardt, 2007-2010 * * based on ipt_ROUTE.c from Cédric de Launois * <delaunois@info.ucl.be> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/route.h> #include <linux/netfilter/x_tables.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; struct tee_net { struct list_head priv_list; /* lock protects the priv_list */ struct mutex lock; }; static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif); return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? 
info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, 
			       ARRAY_SIZE(tee_tg_reg));
	unregister_pernet_subsys(&tee_net_ops);
}

module_init(tee_tg_init);
module_exit(tee_tg_exit);
MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: Reroute packet copy");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_TEE");
MODULE_ALIAS("ip6t_TEE");
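/*
 * Hedged stand-alone sketch (hypothetical module, not part of xt_TEE): the
 * same netdevice-notifier pattern the TEE target uses to keep its cached
 * --oif ifindex in sync as interfaces come, go, or get renamed. The
 * demo_* symbols and the "eth0" name are invented for illustration; a real
 * user would also serialize updates (xt_TEE uses a per-netns mutex).
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/string.h>

static const char demo_oif_name[] = "eth0";	/* assumed interface name */
static int demo_oif = -1;			/* cached ifindex, -1 = absent */

static int demo_netdev_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_REGISTER:
		if (!strcmp(dev->name, demo_oif_name))
			demo_oif = dev->ifindex;
		break;
	case NETDEV_UNREGISTER:
		if (dev->ifindex == demo_oif)
			demo_oif = -1;
		break;
	case NETDEV_CHANGENAME:
		if (!strcmp(dev->name, demo_oif_name))
			demo_oif = dev->ifindex;
		else if (dev->ifindex == demo_oif)
			demo_oif = -1;
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_netdev_notifier = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_netdev_notifier);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_netdev_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");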
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/uaccess.h>
#include <linux/kernel.h>

#ifdef CONFIG_X86_64
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	unsigned long vaddr = (unsigned long)unsafe_src;

	/*
	 * Range covering the highest possible canonical userspace address
	 * as well as non-canonical address range. For the canonical range
	 * we also need to include the userspace guard page.
	 */
	return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
	       __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	return (unsigned long)unsafe_src >= TASK_SIZE_MAX;
}
#endif
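/*
 * Hedged usage sketch (hypothetical helper): copy_from_kernel_nofault() is
 * the consumer of the policy above. It rejects user-space pointers via
 * copy_from_kernel_nofault_allowed() and survives faults on bad kernel
 * pointers instead of oopsing. demo_peek_long() is invented for illustration.
 */
#include <linux/uaccess.h>
#include <linux/printk.h>

static long demo_peek_long(const void *addr)
{
	long val;

	/* returns 0 on success, -ERANGE/-EFAULT if the address is rejected */
	if (copy_from_kernel_nofault(&val, addr, sizeof(val))) {
		pr_info("demo: %px is not safely readable\n", addr);
		return 0;
	}
	return val;
}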
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2015, Sony Mobile Communications Inc.
 * Copyright (c) 2013, The Linux Foundation. All rights reserved.
 * Copyright (c) 2020, Linaro Ltd.
*/ #include <linux/module.h> #include <linux/qrtr.h> #include <linux/workqueue.h> #include <net/sock.h> #include "qrtr.h" #include <trace/events/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> static DEFINE_XARRAY(nodes); static struct { struct socket *sock; struct sockaddr_qrtr bcast_sq; struct list_head lookups; struct workqueue_struct *workqueue; struct work_struct work; int local_node; } qrtr_ns; static const char * const qrtr_ctrl_pkt_strings[] = { [QRTR_TYPE_HELLO] = "hello", [QRTR_TYPE_BYE] = "bye", [QRTR_TYPE_NEW_SERVER] = "new-server", [QRTR_TYPE_DEL_SERVER] = "del-server", [QRTR_TYPE_DEL_CLIENT] = "del-client", [QRTR_TYPE_RESUME_TX] = "resume-tx", [QRTR_TYPE_EXIT] = "exit", [QRTR_TYPE_PING] = "ping", [QRTR_TYPE_NEW_LOOKUP] = "new-lookup", [QRTR_TYPE_DEL_LOOKUP] = "del-lookup", }; struct qrtr_server_filter { unsigned int service; unsigned int instance; unsigned int ifilter; }; struct qrtr_lookup { unsigned int service; unsigned int instance; struct sockaddr_qrtr sq; struct list_head li; }; struct qrtr_server { unsigned int service; unsigned int instance; unsigned int node; unsigned int port; struct list_head qli; }; struct qrtr_node { unsigned int id; struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; node = xa_load(&nodes, node_id); if (node) return node; /* If node didn't exist, allocate and insert it to the tree */ node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return NULL; node->id = node_id; xa_init(&node->servers); if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } return node; } static int server_match(const struct qrtr_server *srv, const struct qrtr_server_filter *f) { unsigned int ifilter = f->ifilter; if (f->service != 0 && srv->service != f->service) return 0; if (!ifilter && f->instance) ifilter = ~0; return (srv->instance & ifilter) == f->instance; } static int service_announce_new(struct sockaddr_qrtr *dest, struct qrtr_server *srv) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; trace_qrtr_ns_service_announce_new(srv->service, srv->instance, srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_NEW_SERVER); pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); return kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); } static int service_announce_del(struct sockaddr_qrtr *dest, struct qrtr_server *srv) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; trace_qrtr_ns_service_announce_del(srv->service, srv->instance, srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_SERVER); pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) pr_err("failed to announce del service\n"); return ret; } static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, bool new) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = 
sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = new ? cpu_to_le32(QRTR_TYPE_NEW_SERVER) : cpu_to_le32(QRTR_TYPE_DEL_SERVER); if (srv) { pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); } msg.msg_name = (struct sockaddr *)to; msg.msg_namelen = sizeof(*to); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) pr_err("failed to send lookup notification\n"); } static int announce_servers(struct sockaddr_qrtr *sq) { struct qrtr_server *srv; struct qrtr_node *node; unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; /* Announce the list of servers registered in this node */ xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } } return 0; } static struct qrtr_server *server_add(unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_server *srv; struct qrtr_server *old; struct qrtr_node *node; if (!service || !port) return NULL; srv = kzalloc(sizeof(*srv), GFP_KERNEL); if (!srv) return NULL; srv->service = service; srv->instance = instance; srv->node = node_id; srv->port = port; node = node_get(node_id); if (!node) goto err; /* Delete the old server on the same port */ old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { if (xa_is_err(old)) { pr_err("failed to add server [0x%x:0x%x] ret:%d\n", srv->service, srv->instance, xa_err(old)); goto err; } else { kfree(old); } } trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); return srv; err: kfree(srv); return NULL; } static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) { struct qrtr_lookup *lookup; struct qrtr_server *srv; struct list_head *li; srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) service_announce_del(&qrtr_ns.bcast_sq, srv); /* Announce the service's disappearance to observers */ list_for_each(li, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->service && lookup->service != srv->service) continue; if (lookup->instance && lookup->instance != srv->instance) continue; lookup_notify(&lookup->sq, srv, false); } kfree(srv); return 0; } static int say_hello(struct sockaddr_qrtr *dest) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_HELLO); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) pr_err("failed to send hello msg\n"); return ret; } /* Announce the list of servers registered on the local node */ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) { int ret; ret = say_hello(sq); if (ret < 0) return ret; return announce_servers(sq); } static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; unsigned long index; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); node = node_get(from->sq_node); if (!node) return 0; /* Advertise removal of this client to all servers of remote 
node */ xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); if (!local_node) return 0; memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) { pr_err("failed to send bye cmd\n"); return ret; } } return 0; } static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct qrtr_node *node; struct list_head *tmp; struct list_head *li; unsigned long index; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); /* Don't accept spoofed messages */ if (from->sq_node != node_id) return -EINVAL; /* Local DEL_CLIENT messages comes from the port being closed */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; /* Remove any lookups by this client */ list_for_each_safe(li, tmp, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->sq.sq_node != node_id) continue; if (lookup->sq.sq_port != port) continue; list_del(&lookup->li); kfree(lookup); } /* Remove the server belonging to this port but don't broadcast * DEL_SERVER. Neighbours would've already removed the server belonging * to this port due to the DEL_CLIENT broadcast from qrtr_port_remove(). 
*/ node = node_get(node_id); if (node) server_del(node, port, false); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); if (!local_node) return 0; memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) { pr_err("failed to send del client cmd\n"); return ret; } } return 0; } static int ctrl_cmd_new_server(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_lookup *lookup; struct qrtr_server *srv; struct list_head *li; int ret = 0; /* Ignore specified node and port for local servers */ if (from->sq_node == qrtr_ns.local_node) { node_id = from->sq_node; port = from->sq_port; } srv = server_add(service, instance, node_id, port); if (!srv) return -EINVAL; if (srv->node == qrtr_ns.local_node) { ret = service_announce_new(&qrtr_ns.bcast_sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } } /* Notify any potential lookups about the new server */ list_for_each(li, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->service && lookup->service != service) continue; if (lookup->instance && lookup->instance != instance) continue; lookup_notify(&lookup->sq, srv, true); } return ret; } static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_node *node; /* Ignore specified node and port for local servers*/ if (from->sq_node == qrtr_ns.local_node) { node_id = from->sq_node; port = from->sq_port; } /* Local servers may only unregister themselves */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; node = node_get(node_id); if (!node) return -ENOENT; return server_del(node, port, true); } static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { struct qrtr_server_filter filter; struct qrtr_lookup *lookup; struct qrtr_server *srv; struct qrtr_node *node; unsigned long node_idx; unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) return -EINVAL; lookup = kzalloc(sizeof(*lookup), GFP_KERNEL); if (!lookup) return -ENOMEM; lookup->sq = *from; lookup->service = service; lookup->instance = instance; list_add_tail(&lookup->li, &qrtr_ns.lookups); memset(&filter, 0, sizeof(filter)); filter.service = service; filter.instance = instance; xa_for_each(&nodes, node_idx, node) { xa_for_each(&node->servers, srv_idx, srv) { if (!server_match(srv, &filter)) continue; lookup_notify(from, srv, true); } } /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); return 0; } static void ctrl_cmd_del_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { struct qrtr_lookup *lookup; struct list_head *tmp; struct list_head *li; list_for_each_safe(li, tmp, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->sq.sq_node != from->sq_node) continue; if (lookup->sq.sq_port != from->sq_port) continue; if (lookup->service != service) continue; if (lookup->instance && 
lookup->instance != instance) continue; list_del(&lookup->li); kfree(lookup); } } static void qrtr_ns_worker(struct work_struct *work) { const struct qrtr_ctrl_pkt *pkt; size_t recv_buf_size = 4096; struct sockaddr_qrtr sq; struct msghdr msg = { }; unsigned int cmd; ssize_t msglen; void *recv_buf; struct kvec iv; int ret; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); recv_buf = kzalloc(recv_buf_size, GFP_KERNEL); if (!recv_buf) return; for (;;) { iv.iov_base = recv_buf; iv.iov_len = recv_buf_size; msglen = kernel_recvmsg(qrtr_ns.sock, &msg, &iv, 1, iv.iov_len, MSG_DONTWAIT); if (msglen == -EAGAIN) break; if (msglen < 0) { pr_err("error receiving packet: %zd\n", msglen); break; } pkt = recv_buf; cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { case QRTR_TYPE_HELLO: ret = ctrl_cmd_hello(&sq); break; case QRTR_TYPE_BYE: ret = ctrl_cmd_bye(&sq); break; case QRTR_TYPE_DEL_CLIENT: ret = ctrl_cmd_del_client(&sq, le32_to_cpu(pkt->client.node), le32_to_cpu(pkt->client.port)); break; case QRTR_TYPE_NEW_SERVER: ret = ctrl_cmd_new_server(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance), le32_to_cpu(pkt->server.node), le32_to_cpu(pkt->server.port)); break; case QRTR_TYPE_DEL_SERVER: ret = ctrl_cmd_del_server(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance), le32_to_cpu(pkt->server.node), le32_to_cpu(pkt->server.port)); break; case QRTR_TYPE_EXIT: case QRTR_TYPE_PING: case QRTR_TYPE_RESUME_TX: break; case QRTR_TYPE_NEW_LOOKUP: ret = ctrl_cmd_new_lookup(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance)); break; case QRTR_TYPE_DEL_LOOKUP: ctrl_cmd_del_lookup(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance)); break; } if (ret < 0) pr_err("failed while handling packet from %d:%d", sq.sq_node, sq.sq_port); } kfree(recv_buf); } static void qrtr_ns_data_ready(struct sock *sk) { trace_sk_data_ready(sk); queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; INIT_LIST_HEAD(&qrtr_ns.lookups); INIT_WORK(&qrtr_ns.work, qrtr_ns_worker); ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { pr_err("failed to get socket name\n"); goto err_sock; } qrtr_ns.workqueue = alloc_ordered_workqueue("qrtr_ns_handler", 0); if (!qrtr_ns.workqueue) { ret = -ENOMEM; goto err_sock; } qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; sq.sq_port = QRTR_PORT_CTRL; qrtr_ns.local_node = sq.sq_node; ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); goto err_wq; } qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR; qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST; qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL; ret = say_hello(&qrtr_ns.bcast_sq); if (ret < 0) goto err_wq; return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); void qrtr_ns_remove(void) { cancel_work_sync(&qrtr_ns.work); destroy_workqueue(qrtr_ns.workqueue); sock_release(qrtr_ns.sock); } EXPORT_SYMBOL_GPL(qrtr_ns_remove); MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>"); MODULE_DESCRIPTION("Qualcomm IPC Router Nameservice"); MODULE_LICENSE("Dual BSD/GPL"); |
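/*
 * Hedged user-space sketch of talking to the name service above: it sends a
 * QRTR_TYPE_NEW_LOOKUP for one service ID to QRTR_PORT_CTRL on the local
 * node and prints the QRTR_TYPE_NEW_SERVER entries that come back until the
 * zeroed terminating entry arrives. The service ID 15 is an arbitrary
 * example, and the local node ID is assumed to be readable via getsockname()
 * on a fresh AF_QIPCRTR socket.
 */
#include <endian.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/qrtr.h>

#ifndef AF_QIPCRTR
#define AF_QIPCRTR 42		/* kernel's address family number */
#endif

int main(void)
{
	struct sockaddr_qrtr self, ctrl;
	struct qrtr_ctrl_pkt pkt;
	socklen_t sl = sizeof(self);
	int sock;

	sock = socket(AF_QIPCRTR, SOCK_DGRAM, 0);
	if (sock < 0 || getsockname(sock, (struct sockaddr *)&self, &sl) < 0) {
		perror("qrtr socket");
		return 1;
	}

	ctrl = self;			/* same node ... */
	ctrl.sq_port = QRTR_PORT_CTRL;	/* ... control port = name service */

	memset(&pkt, 0, sizeof(pkt));	/* control packets are little endian */
	pkt.cmd = htole32(QRTR_TYPE_NEW_LOOKUP);
	pkt.server.service = htole32(15);	/* example service ID */

	if (sendto(sock, &pkt, sizeof(pkt), 0,
		   (struct sockaddr *)&ctrl, sizeof(ctrl)) < 0) {
		perror("lookup");
		return 1;
	}

	while (recv(sock, &pkt, sizeof(pkt), 0) >= (ssize_t)sizeof(pkt)) {
		if (!pkt.server.service)	/* empty entry ends the list */
			break;
		printf("service %u:%u at node %u port %u\n",
		       le32toh(pkt.server.service),
		       le32toh(pkt.server.instance),
		       le32toh(pkt.server.node), le32toh(pkt.server.port));
	}

	close(sock);
	return 0;
}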
9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | // SPDX-License-Identifier: GPL-2.0-only /* * crc16.c */ #include <linux/types.h> #include <linux/module.h> #include <linux/crc16.h> /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ u16 const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; EXPORT_SYMBOL(crc16_table); /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ u16 crc16(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc16_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc16); MODULE_DESCRIPTION("CRC16 calculations"); MODULE_LICENSE("GPL"); |
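/*
 * Hedged stand-alone sketch (user space, not kernel code): the same CRC
 * computed bit by bit. The table above is just this inner loop precomputed
 * for every byte value; 0xA001 is the bit-reversed form of the 0x8005
 * polynomial, the CRC starts at 0, and data is fed in LSB first.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint16_t demo_crc16(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= *buf++;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xA001 : crc >> 1;
	}
	return crc;
}

int main(void)
{
	const char *msg = "123456789";

	/* the standard check value for this CRC (CRC-16/ARC) is 0xbb3d */
	printf("crc16 = 0x%04x\n",
	       demo_crc16(0, (const uint8_t *)msg, strlen(msg)));
	return 0;
}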
2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch.
 * See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count..
*/ static int max_threads; /* tunable limit on nr_threads */ #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) static const char * const resident_page_types[] = { NAMED_ARRAY_INDEX(MM_FILEPAGES), NAMED_ARRAY_INDEX(MM_ANONPAGES), NAMED_ARRAY_INDEX(MM_SWAPENTS), NAMED_ARRAY_INDEX(MM_SHMEMPAGES), }; DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) { return lockdep_is_held(&tasklist_lock); } EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); #endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { int cpu; int total = 0; for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; } void __weak arch_release_task_struct(struct task_struct *tsk) { } #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } #endif #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) # ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; static bool try_release_thread_stack_to_cache(struct vm_struct *vm) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) continue; return true; } return false; } static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; vfree(vm_stack); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct vm_stack *vm_stack = tsk->stack; vm_stack->stack_vm_area = tsk->stack_vm_area; call_rcu(&vm_stack->rcu, thread_stack_free_rcu); } static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *vm_stack = cached_vm_stacks[i]; if (!vm_stack) continue; vfree(vm_stack->addr); cached_vm_stacks[i] = NULL; } return 0; } static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; int nr_charged = 0; BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; } return 0; err: for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *s; s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); stack = kasan_reset_tag(s->addr); /* Clear stale pointers from reused stack. 
*/ memset(stack, 0, THREAD_SIZE); if (memcg_charge_kernel_stack(s)) { vfree(s->addr); return -ENOMEM; } tsk->stack_vm_area = s; tsk->stack = stack; return 0; } /* * Allocated stacks are cached and later reused by new threads, * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; vm = find_vm_area(stack); if (memcg_charge_kernel_stack(vm)) { vfree(stack); return -ENOMEM; } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ tsk->stack_vm_area = vm; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } static void free_thread_stack(struct task_struct *tsk) { if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) thread_stack_delayed_free(tsk); tsk->stack = NULL; tsk->stack_vm_area = NULL; } # else /* !CONFIG_VMAP_STACK */ static void thread_stack_free_rcu(struct rcu_head *rh) { __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); return 0; } return -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } # endif /* CONFIG_VMAP_STACK */ # else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ static struct kmem_cache *thread_stack_cache; static void thread_stack_free_rcu(struct rcu_head *rh) { kmem_cache_free(thread_stack_cache, rh); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } void thread_stack_cache_init(void) { thread_stack_cache = kmem_cache_create_usercopy("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, 0, THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ #else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = arch_alloc_thread_stack_node(tsk, node); tsk->stack = stack; return stack ? 
0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { arch_free_thread_stack(tsk); tsk->stack = NULL; } #endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; #ifdef CONFIG_PER_VMA_LOCK /* SLAB cache for vm_area_struct.lock */ static struct kmem_cache *vma_lock_cachep; static bool vma_lock_alloc(struct vm_area_struct *vma) { vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); if (!vma->vm_lock) return false; init_rwsem(&vma->vm_lock->lock); vma->vm_lock_seq = -1; return true; } static inline void vma_lock_free(struct vm_area_struct *vma) { kmem_cache_free(vma_lock_cachep, vma->vm_lock); } #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } static inline void vma_lock_free(struct vm_area_struct *vma) {} #endif /* CONFIG_PER_VMA_LOCK */ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return NULL; vma_init(vma, mm); if (!vma_lock_alloc(vma)) { kmem_cache_free(vm_area_cachep, vma); return NULL; } return vma; } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return NULL; ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); /* * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. */ data_race(memcpy(new, orig, sizeof(*new))); if (!vma_lock_alloc(new)) { kmem_cache_free(vm_area_cachep, new); return NULL; } INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; } void __vm_area_free(struct vm_area_struct *vma) { vma_numab_state_free(vma); free_anon_vma_name(vma); vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } #ifdef CONFIG_PER_VMA_LOCK static void vm_area_free_rcu_cb(struct rcu_head *head) { struct vm_area_struct *vma = container_of(head, struct vm_area_struct, vm_rcu); /* The vma should not be locked while being destroyed. */ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); __vm_area_free(vma); } #endif void vm_area_free(struct vm_area_struct *vma) { #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else __vm_area_free(vma); #endif } static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); /* All stack pages are in the same node. 
*/ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } void exit_task_stack_account(struct task_struct *tsk) { account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm; int i; vm = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); } } static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ free_thread_stack(tsk); } #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif void free_task(struct task_struct *tsk) { #ifdef CONFIG_SECCOMP WARN_ON_ONCE(tsk->seccomp.filter); #endif release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, * so free both. */ release_task_stack(tsk); #else /* * If the task had a separate stack allocation, it should be gone * by now. */ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); /* * We depend on the oldmm having properly denied write access to the * exe_file already. */ if (exe_file && deny_write_access(exe_file)) pr_warn_once("deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(old_vmi, oldmm, 0); VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { retval = -EINTR; goto fail_uprobe_end; } flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(old_vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; /* * Don't duplicate many vmas if we've been oom-killed (for * example) */ if (fatal_signal_pending(current)) { retval = -EINTR; goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. 
* Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ if (vma_iter_bulk_store(&vmi, tmp)) goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto loop_out; } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) mt_set_in_rcu(vmi.mas.tree); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; } static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ static void check_mm(struct mm_struct *mm) { int i; BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) static void do_check_lazy_tlb(void *arg) { struct mm_struct *mm = arg; WARN_ON_ONCE(current->active_mm == mm); } static void do_shoot_lazy_tlb(void *arg) { struct mm_struct *mm = arg; if (current->active_mm == mm) { WARN_ON_ONCE(current->mm); current->active_mm = &init_mm; switch_mm(mm, &init_mm, current); } } static void cleanup_lazy_tlbs(struct mm_struct *mm) { if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { /* * In this case, lazy tlb mms are refounted and would not reach * __mmdrop until all CPUs have switched away and mmdrop()ed. */ return; } /* * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it * requires lazy mm users to switch to another mm when the refcount * drops to zero, before the mm is freed. This requires IPIs here to * switch kernel threads to init_mm. 
* * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm * switch with the final userspace teardown TLB flush which leaves the * mm lazy on this CPU but no others, reducing the need for additional * IPIs here. There are cases where a final IPI is still required here, * such as the final mmdrop being performed on a different CPU than the * one exiting, or kernel threads using the mm when userspace exits. * * IPI overheads have not found to be expensive, but they could be * reduced in a number of possible ways, for example (roughly * increasing order of complexity): * - The last lazy reference created by exit_mm() could instead switch * to init_mm, however it's probable this will run on the same CPU * immediately afterwards, so this may not reduce IPIs much. * - A batch of mms requiring IPIs could be gathered and freed at once. * - CPUs store active_mm where it can be remotely checked without a * lock, to filter out false-positives in the cpumask. * - After mm_users or mm_count reaches zero, switching away from the * mm could clear mm_cpumask to reduce some IPIs, perhaps together * with some batching or delaying of the final IPIs. * - A delayed freeing and RCU-like quiescing sequence based on mm * switching to avoid IPIs completely. */ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); } /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); /* Ensure no CPUs are using this as their lazy tlb mm */ cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; mm = container_of(work, struct mm_struct, async_put_work); __mmdrop(mm); } static void mmdrop_async(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) { INIT_WORK(&mm->async_put_work, mmdrop_async_fn); schedule_work(&mm->async_put_work); } } static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); /* * __mmdrop is not safe to call from softirq context on x86 due to * pgd_dtor so postpone it to the async context */ if (sig->oom_mm) mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); void __put_task_struct_rcu_cb(struct rcu_head *rhp) { struct task_struct *task = container_of(rhp, struct task_struct, rcu); __put_task_struct(task); } EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ static void set_max_threads(unsigned int 
max_threads_suggested) { u64 threads; unsigned long nr_pages = totalram_pages(); /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) threads = max_threads_suggested; max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT /* Initialized by the architecture: */ int arch_task_struct_size __read_mostly; #endif #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); /* * Handle zero-sized whitelist or empty thread_struct, otherwise * adjust offset to position of thread_struct in task_struct. */ if (unlikely(*size == 0)) *offset = 0; else *offset += offsetof(struct task_struct, thread); } #endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ task_struct_whitelist(&useroffset, &usersize); task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); #endif /* do the arch specific task caches init */ arch_task_cache_init(); set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif scs_init(); lockdep_init_task(&init_task); uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; return 0; } void set_task_stack_end_magic(struct task_struct *tsk) { unsigned long *stackend; stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ } static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; int err; if (node == NUMA_NO_NODE) node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; err = arch_dup_task_struct(tsk, orig); if (err) goto free_tsk; err = alloc_thread_stack_node(tsk, node); if (err) goto free_tsk; #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. 
Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. */ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. * One for the scheduler. */ refcount_set(&tsk->rcu_users, 2); /* One for the rcu users */ refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; tsk->worker_private = NULL; kcov_task_init(tsk); kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif #ifdef CONFIG_BLK_CGROUP tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif #ifdef CONFIG_IOMMU_SVA tsk->pasid_activated = 0; #endif #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif #ifdef CONFIG_CPU_SUP_INTEL tsk->reported_split_lock = 0; #endif #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; tsk->migrate_from_cpu = -1; #endif return tsk; free_stack: exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { default_dump_filter = (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & MMF_DUMP_FILTER_MASK; return 1; } __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); mm->ioctx_table = NULL; #endif } static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif } static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG mm->owner = p; #endif } static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; #endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); #ifdef CONFIG_PER_VMA_LOCK mm->mm_lock_seq = 0; #endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; mm->def_flags = 0; } if 
(mm_alloc_pgd(mm)) goto fail_nopgd; if (init_new_context(p, mm)) goto fail_nocontext; if (mm_alloc_cid(mm)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: mm_destroy_cid(mm); fail_cid: destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: free_mm(mm); return NULL; } /* * Allocate and initialize an mm_struct. */ struct mm_struct *mm_alloc(void) { struct mm_struct *mm; mm = allocate_mm(); if (!mm) return NULL; memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); mmdrop(mm); } /* * Decrement the use count and release all resources for an mm. */ void mmput(struct mm_struct *mm) { might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } EXPORT_SYMBOL_GPL(mmput); #ifdef CONFIG_MMU static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); __mmput(mm); } void mmput_async(struct mm_struct *mm) { if (atomic_dec_and_test(&mm->mm_users)) { INIT_WORK(&mm->async_put_work, mmput_async_fn); schedule_work(&mm->async_put_work); } } EXPORT_SYMBOL_GPL(mmput_async); #endif /** * set_mm_exe_file - change a reference to the mm's executable file * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve it happens before * the new mm is made visible to anyone. * * Can only fail if new_exe_file != NULL. */ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; /* * It is safe to dereference the exe_file without RCU as * this function is only called if nobody else can access * this mm -- see comment above for justification. */ old_exe_file = rcu_dereference_raw(mm->exe_file); if (new_exe_file) { /* * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. */ if (unlikely(deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * replace_mm_exe_file - replace a reference to the mm's executable file * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). */ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct vm_area_struct *vma; struct file *old_exe_file; int ret = 0; /* Forbid mm->exe_file change if old file still mapped. 
*/ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, &old_exe_file->f_path)) { ret = -EBUSY; break; } } mmap_read_unlock(mm); fput(old_exe_file); if (ret) return ret; } ret = deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); /* set the new file */ mmap_write_lock(mm); old_exe_file = rcu_dereference_raw(mm->exe_file); rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); if (old_exe_file) { allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * get_mm_exe_file - acquire a reference to the mm's executable file * * Returns %NULL if mm has no associated executable file. * User must release file via fput(). */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; rcu_read_lock(); exe_file = rcu_dereference(mm->exe_file); if (exe_file && !get_file_rcu(exe_file)) exe_file = NULL; rcu_read_unlock(); return exe_file; } /** * get_task_exe_file - acquire a reference to the task's executable file * * Returns %NULL if task's mm (if any) has no associated executable file or * this is a kernel thread with borrowed mm (see the comment above get_task_mm). * User must release file via fput(). */ struct file *get_task_exe_file(struct task_struct *task) { struct file *exe_file = NULL; struct mm_struct *mm; task_lock(task); mm = task->mm; if (mm) { if (!(task->flags & PF_KTHREAD)) exe_file = get_mm_exe_file(mm); } task_unlock(task); return exe_file; } /** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. */ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; task_lock(task); mm = task->mm; if (mm) { if (task->flags & PF_KTHREAD) mm = NULL; else mmget(mm); } task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); mm = get_task_mm(task); if (mm && mm != current->mm && !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } up_read(&task->signal->exec_update_lock); return mm; } static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; task_lock(tsk); vfork = tsk->vfork_done; if (likely(vfork)) { tsk->vfork_done = NULL; complete(vfork); } task_unlock(tsk); } static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); if (killed) { task_lock(child); child->vfork_done = NULL; task_unlock(child); } put_task_struct(child); return killed; } /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. 
* * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { uprobe_free_utask(tsk); /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* * Signal userspace if we're not exiting with a core dump * because we want to leave the value intact for debugging * purposes. */ if (tsk->clear_child_tid) { if (atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); do_futex(tsk->clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } /* * All done, finally we can wake up parent and return this mm to him. * Also kthread_stop() uses this completion for synchronization. */ if (tsk->vfork_done) complete_vfork_done(tsk); } void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exec_release(tsk); mm_release(tsk, mm); } /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. * @oldmm: the mm to duplicate. * * Allocates a new mm structure and duplicates the provided @oldmm structure * content into it. * * Return: the duplicated mm or NULL on failure. */ static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm) { struct mm_struct *mm; int err; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); if (err) goto free_pt; mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); fail_nomem: return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; tsk->last_switch_time = 0; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; } else { mm = dup_mm(tsk, current->mm); if (!mm) return -ENOMEM; } tsk->mm = mm; tsk->active_mm = mm; sched_mm_cid_fork(tsk); return 0; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; } static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; int error = 0; /* * A background process may not have any files ... 
*/ oldf = current->files; if (!oldf) goto out; if (no_files) { tsk->files = NULL; goto out; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; } newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; tsk->files = newf; error = 0; out: return error; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); /* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */ if (clone_flags & CLONE_CLEAR_SIGHAND) flush_signal_handlers(tsk, 0); return 0; } void __cleanup_sighand(struct sighand_struct *sighand) { if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } /* * Initialize POSIX timer handling for a thread group. */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); } static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; if (clone_flags & CLONE_THREAD) return 0; sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; sig->nr_threads = 1; sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); tty_audit_fork(sig); sched_autogroup_fork(sig); sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); init_rwsem(&sig->exec_update_lock); return 0; } static void copy_seccomp(struct task_struct *p) { #ifdef CONFIG_SECCOMP /* * Must be called with sighand->lock held, which is common to * all threads in the group. Holding cred_guard_mutex is not * needed because this new task is not yet running and cannot * be racing exec. */ assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); p->seccomp = current->seccomp; /* * Explicitly enable no_new_privs here in case it got set * between the task_struct being duplicated and holding the * sighand lock. The seccomp state and nnp must be in sync.
*/ if (task_no_new_privs(current)) task_set_no_new_privs(p); /* * If the parent gained a seccomp mode after copying thread * flags and between before we held the sighand lock, we have * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) set_task_syscall_work(p, SECCOMP); #endif } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; return task_pid_vnr(current); } static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); } static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { if (type == PIDTYPE_PID) task->thread_pid = pid; else task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) { if (file->f_op == &pidfd_fops) return file->private_data; return ERR_PTR(-EBADF); } static int pidfd_release(struct inode *inode, struct file *file) { struct pid *pid = file->private_data; file->private_data = NULL; put_pid(pid); return 0; } #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd * @m: proc fdinfo file * @f: file referencing a pidfd * * Pid: * This function will print the pid that a given pidfd refers to in the * pid namespace of the procfs instance. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its pid. This is * similar to calling getppid() on a process whose parent is outside of * its pid namespace. * * NSpid: * If pid namespaces are supported then this function will also print * the pid of a given pidfd refers to for all descendant pid namespaces * starting from the current pid namespace of the instance, i.e. the * Pid field and the first entry in the NSpid field will be identical. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its first NSpid * entry and no others will be shown. * Note that this differs from the Pid and NSpid fields in * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a * different branch of the tree, i.e. 
where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the * new pid namespace and mount procfs) * - create a process with a pidfd in ns1 * - send pidfd from ns1 to ns2 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid * have exactly one entry, which is 0 */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid *pid = f->private_data; struct pid_namespace *ns; pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that * ns, i.e. the pid namespace associated with the procfs * instance, is in the pid namespace hierarchy of pid. * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); } #endif /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = file->private_data; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ if (thread_group_exited(pid)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } const struct file_operations pidfd_fops = { .release = pidfd_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif }; /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @pidfd: the pidfd to return * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * The helper doesn't perform checks on @pid which makes it useful for pidfds * created via CLONE_PIDFD where @pid has no task attached when the pidfd and * pidfd file are prepared. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged. 
*/ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { int pidfd; struct file *pidfd_file; if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) return -EINVAL; pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (pidfd < 0) return pidfd; pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, flags | O_RDWR | O_CLOEXEC); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } get_pid(pid); /* held by pidfd_file now */ *ret = pidfd_file; return pidfd; } /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @pidfd: the pidfd to return * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * * The helper verifies that @pid is used as a thread group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged. */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); } static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); free_task(tsk); } static __always_inline void delayed_free_task(struct task_struct *tsk) { if (IS_ENABLED(CONFIG_MEMCG)) call_rcu(&tsk->rcu, __delayed_free_task); else free_task(tsk); } static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) { /* Skip if kernel thread */ if (!tsk->mm) return; /* Skip if spawning a thread or using vfork */ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) return; /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_unlock(&oom_adj_mutex); } #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { int i; for (i = 0; i < RV_PER_TASK_MONITORS; i++) p->rv[i].da_mon.monitoring = false; } #else #define rv_task_fork(p) do {} while (0) #endif /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. 
*/ __latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different * namespace */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task. */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != nsp->pid_ns_for_children)) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. * - CLONE_THREAD is blocked until someone really needs it. */ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) return ERR_PTR(-EINVAL); } /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple * processes that happen during the fork and delay them so that * they appear to happen after the fork. */ sigemptyset(&delayed.signal); INIT_HLIST_NODE(&delayed.node); spin_lock_irq(¤t->sighand->siglock); if (!(clone_flags & CLONE_THREAD)) hlist_add_head(&delayed.node, ¤t->signal->multiprocess); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); retval = -ERESTARTNOINTR; if (task_sigpending(current)) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) goto fork_out; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; if (args->user_worker) { /* * Mark us a user worker, and block any signal that isn't * fatal or STOP */ p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } if (args->io_thread) p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
args->child_tid : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; retval = -EAGAIN; if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; #endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime.seqcount); p->vtime.starttime = 0; p->vtime.state = VTIME_INACTIVE; #endif #ifdef CONFIG_IO_URING p->io_uring = NULL; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI p->psi_flags = 0; #endif task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); p->irqtrace.hardirq_disable_ip = _THIS_IP_; p->irqtrace.softirq_enable_ip = _THIS_IP_; p->softirqs_enabled = 1; p->softirq_context = 0; #endif p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); p->bpf_ctx = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; stackleak_task_init(p); if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; } } /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } #ifdef CONFIG_BLOCK p->plug = NULL; #endif futex_init_task(p); /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; p->tgid = current->tgid; } else { p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif #ifdef CONFIG_RETHOOK p->rethooks.first = NULL; #endif /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; /* * Now that the cgroups are pinned, re-clone the parent cgroup and put * the new task on the correct runqueue. All this *before* the task * becomes visible. * * This isn't part of ->can_fork() because while the re-cloning is * cgroup specific, it unconditionally needs to place the task on a * runqueue. 
*/ sched_cgroup_fork(p, args); /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by * stalling fork(2) after we recorded the start_time but before it is * visible to the system. */ p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but don't wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; if (clone_flags & CLONE_THREAD) p->exit_signal = -1; else p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; p->exit_signal = args->exit_signal; } klp_copy_process(p); sched_core_fork(p); spin_lock(&current->sighand->siglock); rv_task_fork(p); rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; goto bad_fork_cancel_cgroup; } /* No more failure paths after this point. */ /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization.
*/ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; current->signal->quick_threads++; atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; } total_forks++; hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); if (pidfile) fd_install(pidfd, pidfile); proc_fork_connector(p); sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); return p; bad_fork_cancel_cgroup: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); put_unused_fd(pidfd); } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { mm_clear_owner(p->mm, p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); spin_unlock_irq(¤t->sighand->siglock); return ERR_PTR(retval); } static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ init_task_pid(idle, type, &init_struct_pid); } } static int idle_dummy(void *dummy) { /* This function is never called */ return 0; } struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, .fn = &idle_dummy, .fn_arg = NULL, .kthread = 1, .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); } return task; } /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. * The returned task is inactive, and the caller must fire it up through * wake_up_new_task(p). All signals are blocked in the created task. 
*/ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| CLONE_IO; struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, .io_thread = 1, .user_worker = 1, }; return copy_process(NULL, 0, node, &args); } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. * * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate * field in struct clone_args and it still doesn't make sense to have * them both point at the same memory location. Performing this check * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ if ((args->flags & CLONE_PIDFD) && (args->flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) return PTR_ERR(p); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); task_unlock(p); } wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); return nr; } /* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, .kthread = 1, }; return kernel_clone(&args); } /* * Create a user mode thread. 
*/ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, }; return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU struct kernel_clone_args args = { .exit_signal = SIGCHLD, }; return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; #endif } #endif #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { struct kernel_clone_args args = { .flags = CLONE_VFORK | CLONE_VM, .exit_signal = SIGCHLD, }; return kernel_clone(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #endif { struct kernel_clone_args args = { .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; return kernel_clone(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; struct clone_args args; pid_t *kset_tid = kargs->set_tid; BUILD_BUG_ON(offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0); BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1); BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER2); BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (err) return err; if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) return -EINVAL; if (unlikely(!args.set_tid && args.set_tid_size > 0)) return -EINVAL; if (unlikely(args.set_tid && args.set_tid_size == 0)) return -EINVAL; /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || !valid_signal(args.exit_signal))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) return -EINVAL; *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), .child_tid = u64_to_user_ptr(args.child_tid), .parent_tid = u64_to_user_ptr(args.parent_tid), .exit_signal = args.exit_signal, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, .cgroup = args.cgroup, }; if (args.set_tid && copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT; kargs->set_tid = kset_tid; return 0; } /** * clone3_stack_valid - 
check and prepare stack * @kargs: kernel clone args * * Verify that the stack arguments userspace gave us are sane. * In addition, set the stack direction for userspace since it's easy for us to * determine. */ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) { if (kargs->stack == 0) { if (kargs->stack_size > 0) return false; } else { if (kargs->stack_size == 0) return false; if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) return false; #if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) kargs->stack += kargs->stack_size; #endif } return true; } static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* * - make the CLONE_DETACHED bit reusable for clone3 * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) return false; if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) return false; if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; if (!clone3_stack_valid(kargs)) return false; return true; } /** * clone3 - create a new process with specific properties * @uargs: argument structure * @size: size of @uargs * * clone3() is the extensible successor to clone()/clone2(). * It takes a struct as argument that is versioned by its size. * * Return: On success, a positive PID for the child process. * On error, a negative errno number. */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) return err; if (!clone3_args_valid(&kargs)) return -EINVAL; return kernel_clone(&kargs); } #endif void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { struct task_struct *leader, *parent, *child; int res; read_lock(&tasklist_lock); leader = top = top->group_leader; down: for_each_thread(leader, parent) { list_for_each_entry(child, &parent->children, sibling) { res = visitor(child, data); if (res) { if (res < 0) goto out; leader = child; goto down; } up: ; } } if (leader != top) { child = leader; parent = child->real_parent; leader = parent->group_leader; goto up; } out: read_unlock(&tasklist_lock); } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); init_waitqueue_head(&sighand->signalfd_wqh); } void __init mm_cache_init(void) { unsigned int mm_size; /* * The mm_cpumask is located at the end of mm_struct, and is * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). 
*/ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), NULL); } void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); #ifdef CONFIG_PER_VMA_LOCK vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); #endif mmap_init(); nsproxy_cache_init(); } /* * Check constraints on flags passed to the unshare system call. */ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing * to unshare. Note that unsharing the address space or the * signal handlers also need to unshare the signal queues (aka * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { if (!thread_group_empty(current)) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { if (refcount_read(¤t->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { if (!current_is_single_threaded()) return -EINVAL; } return 0; } /* * Unshare the filesystem structure if it is being shared */ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; if (!(unshare_flags & CLONE_FS) || !fs) return 0; /* don't need lock here; in the worst case we'll do useless copy */ if (fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); if (!*new_fsp) return -ENOMEM; return 0; } /* * Unshare file descriptor table if it is being shared */ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } return 0; } /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. */ int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; /* * If unsharing a user namespace must also unshare the thread group * and unshare the filesystem root and working directories. 
*/ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* * If unsharing a signal handlers, must also unshare the signal queues. */ if (unshare_flags & CLONE_SIGHAND) unshare_flags |= CLONE_THREAD; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; if (new_cred) { err = set_cred_ucounts(new_cred); if (err) goto bad_unshare_cleanup_cred; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). */ exit_sem(current); } if (unshare_flags & CLONE_NEWIPC) { /* Orphan segments in old ns (see sem above). */ exit_shm(current); shm_init_task(current); } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); task_lock(current); if (new_fs) { fs = current->fs; spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; spin_unlock(&fs->lock); } if (new_fd) swap(current->files, new_fd); task_unlock(current); if (new_cred) { /* Install the new user namespace */ commit_creds(new_cred); new_cred = NULL; } } perf_event_namespaces(current); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); bad_unshare_out: return err; } SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { return ksys_unshare(unshare_flags); } /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to * the exec layer of the kernel. */ int unshare_files(void) { struct task_struct *task = current; struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) return error; old = task->files; task_lock(task); task->files = copy; task_unlock(task); put_files_struct(old); return 0; } int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; int threads = max_threads; int min = 1; int max = MAX_THREADS; t = *table; t.data = &threads; t.extra1 = &min; t.extra2 = &max; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || !write) return ret; max_threads = threads; return 0; } |
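Before moving on to the rxrpc tracepoint definitions, here is a minimal userspace sketch, not part of the kernel sources quoted above, showing how the CLONE_PIDFD path implemented by copy_process() and __pidfd_prepare() is typically consumed. It assumes a Linux 5.3+ kernel and libc headers that expose SYS_clone3 and the uapi struct clone_args: clone3() asks the kernel to reserve a pidfd, copy_process() writes it back via put_user(), and the parent then waits for child exit through the pidfd_poll() handler shown earlier.

/* Illustrative sketch only; flag and struct names come from the uapi headers. */
#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;
	struct pollfd pfd;
	pid_t pid;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;		/* ask copy_process() to reserve a pidfd */
	args.pidfd = (__u64)(uintptr_t)&pidfd;	/* written back via put_user() on success */
	args.exit_signal = SIGCHLD;

	pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)
		_exit(0);			/* child: exit straight away */

	/* pidfd_poll() reports EPOLLIN once the whole thread group has exited */
	pfd.fd = pidfd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);
	waitpid(pid, NULL, 0);
	printf("child %d exited (pidfd %d)\n", (int)pid, pidfd);
	return 0;
}

poll() is used here because it maps directly onto pidfd_poll() above; the same descriptor could instead be handed to pidfd_send_signal() or passed to another process over a unix socket.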
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* AF_RXRPC tracepoints * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rxrpc #if !defined(_TRACE_RXRPC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RXRPC_H #include <linux/tracepoint.h> #include <linux/errqueue.h> /* * Declare tracing information enums and their string mappings for display.
*/ #define rxrpc_abort_reasons \ /* AFS errors */ \ EM(afs_abort_general_error, "afs-error") \ EM(afs_abort_interrupted, "afs-intr") \ EM(afs_abort_oom, "afs-oom") \ EM(afs_abort_op_not_supported, "afs-op-notsupp") \ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ EM(rxperf_abort_op_not_supported, "rxperf-op-notsupp") \ EM(rxperf_abort_unmarshal_error, "rxperf-unmarshal") \ /* RxKAD security errors */ \ EM(rxkad_abort_1_short_check, "rxkad1-short-check") \ EM(rxkad_abort_1_short_data, "rxkad1-short-data") \ EM(rxkad_abort_1_short_encdata, "rxkad1-short-encdata") \ EM(rxkad_abort_1_short_header, "rxkad1-short-hdr") \ EM(rxkad_abort_2_short_check, "rxkad2-short-check") \ EM(rxkad_abort_2_short_data, "rxkad2-short-data") \ EM(rxkad_abort_2_short_header, "rxkad2-short-hdr") \ EM(rxkad_abort_2_short_len, "rxkad2-short-len") \ EM(rxkad_abort_bad_checksum, "rxkad2-bad-cksum") \ EM(rxkad_abort_chall_key_expired, "rxkad-chall-key-exp") \ EM(rxkad_abort_chall_level, "rxkad-chall-level") \ EM(rxkad_abort_chall_no_key, "rxkad-chall-nokey") \ EM(rxkad_abort_chall_short, "rxkad-chall-short") \ EM(rxkad_abort_chall_version, "rxkad-chall-version") \ EM(rxkad_abort_resp_bad_callid, "rxkad-resp-bad-callid") \ EM(rxkad_abort_resp_bad_checksum, "rxkad-resp-bad-cksum") \ EM(rxkad_abort_resp_bad_param, "rxkad-resp-bad-param") \ EM(rxkad_abort_resp_call_ctr, "rxkad-resp-call-ctr") \ EM(rxkad_abort_resp_call_state, "rxkad-resp-call-state") \ EM(rxkad_abort_resp_key_expired, "rxkad-resp-key-exp") \ EM(rxkad_abort_resp_key_rejected, "rxkad-resp-key-rej") \ EM(rxkad_abort_resp_level, "rxkad-resp-level") \ EM(rxkad_abort_resp_nokey, "rxkad-resp-nokey") \ EM(rxkad_abort_resp_ooseq, "rxkad-resp-ooseq") \ EM(rxkad_abort_resp_short, "rxkad-resp-short") \ EM(rxkad_abort_resp_short_tkt, "rxkad-resp-short-tkt") \ EM(rxkad_abort_resp_tkt_aname, "rxkad-resp-tk-aname") \ EM(rxkad_abort_resp_tkt_expired, "rxkad-resp-tk-exp") \ EM(rxkad_abort_resp_tkt_future, "rxkad-resp-tk-future") \ EM(rxkad_abort_resp_tkt_inst, "rxkad-resp-tk-inst") \ EM(rxkad_abort_resp_tkt_len, "rxkad-resp-tk-len") \ EM(rxkad_abort_resp_tkt_realm, "rxkad-resp-tk-realm") \ EM(rxkad_abort_resp_tkt_short, "rxkad-resp-tk-short") \ EM(rxkad_abort_resp_tkt_sinst, "rxkad-resp-tk-sinst") \ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ EM(rxrpc_abort_call_sendmsg, "call-sendmsg") \ EM(rxrpc_abort_call_sock_release, "call-sock-rel") \ EM(rxrpc_abort_call_sock_release_tba, "call-sock-rel-tba") \ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ EM(rxrpc_badmsg_bad_abort, "bad-abort") \ EM(rxrpc_badmsg_bad_jumbo, "bad-jumbo") \ EM(rxrpc_badmsg_short_ack, "short-ack") \ EM(rxrpc_badmsg_short_ack_info, "short-ack-info") \ EM(rxrpc_badmsg_short_hdr, "short-hdr") \ EM(rxrpc_badmsg_unsupported_packet, "unsup-pkt") \ EM(rxrpc_badmsg_zero_call, "zero-call") \ EM(rxrpc_badmsg_zero_seq, "zero-seq") \ 
EM(rxrpc_badmsg_zero_service, "zero-service") \ EM(rxrpc_eproto_ackr_outside_window, "ackr-out-win") \ EM(rxrpc_eproto_ackr_sack_overflow, "ackr-sack-over") \ EM(rxrpc_eproto_ackr_short_sack, "ackr-short-sack") \ EM(rxrpc_eproto_ackr_zero, "ackr-zero") \ EM(rxrpc_eproto_bad_upgrade, "bad-upgrade") \ EM(rxrpc_eproto_data_after_last, "data-after-last") \ EM(rxrpc_eproto_different_last, "diff-last") \ EM(rxrpc_eproto_early_reply, "early-reply") \ EM(rxrpc_eproto_improper_term, "improper-term") \ EM(rxrpc_eproto_no_client_call, "no-cl-call") \ EM(rxrpc_eproto_no_client_conn, "no-cl-conn") \ EM(rxrpc_eproto_no_service_call, "no-sv-call") \ EM(rxrpc_eproto_reupgrade, "re-upgrade") \ EM(rxrpc_eproto_rxnull_challenge, "rxnull-chall") \ EM(rxrpc_eproto_rxnull_response, "rxnull-resp") \ EM(rxrpc_eproto_tx_rot_last, "tx-rot-last") \ EM(rxrpc_eproto_unexpected_ack, "unex-ack") \ EM(rxrpc_eproto_unexpected_ackall, "unex-ackall") \ EM(rxrpc_eproto_unexpected_implicit_end, "unex-impl-end") \ EM(rxrpc_eproto_unexpected_reply, "unex-reply") \ EM(rxrpc_eproto_wrong_security, "wrong-sec") \ EM(rxrpc_recvmsg_excess_data, "recvmsg-excess") \ EM(rxrpc_recvmsg_short_data, "recvmsg-short") \ E_(rxrpc_sendmsg_late_send, "sendmsg-late") #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ EM(rxrpc_call_poke_start, "Start") \ EM(rxrpc_call_poke_timer, "Timer") \ E_(rxrpc_call_poke_timer_now, "Timer-now") #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ EM(rxrpc_local_get_call, "GET call ") \ EM(rxrpc_local_get_client_conn, "GET conn-cln") \ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ EM(rxrpc_local_new, "NEW ") \ EM(rxrpc_local_put_bind, "PUT bind ") \ EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_peer, "PUT peer-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ EM(rxrpc_local_unuse_bind, "UNU bind ") \ EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ 
EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ EM(rxrpc_local_use_lookup, "USE lookup ") \ E_(rxrpc_local_use_peer_keepalive, "USE peer-kpa") #define rxrpc_peer_traces \ EM(rxrpc_peer_free, "FREE ") \ EM(rxrpc_peer_get_accept, "GET accept ") \ EM(rxrpc_peer_get_bundle, "GET bundle ") \ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ EM(rxrpc_peer_get_input, "GET input ") \ EM(rxrpc_peer_get_input_error, "GET inpt-err") \ EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ EM(rxrpc_peer_new_client, "NEW client ") \ EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ EM(rxrpc_peer_put_bundle, "PUT bundle ") \ EM(rxrpc_peer_put_call, "PUT call ") \ EM(rxrpc_peer_put_conn, "PUT conn ") \ EM(rxrpc_peer_put_input, "PUT input ") \ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_bundle_traces \ EM(rxrpc_bundle_free, "FREE ") \ EM(rxrpc_bundle_get_client_call, "GET clt-call") \ EM(rxrpc_bundle_get_client_conn, "GET clt-conn") \ EM(rxrpc_bundle_get_service_conn, "GET svc-conn") \ EM(rxrpc_bundle_put_call, "PUT call ") \ EM(rxrpc_bundle_put_conn, "PUT conn ") \ EM(rxrpc_bundle_put_discard, "PUT discard ") \ E_(rxrpc_bundle_new, "NEW ") #define rxrpc_conn_traces \ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ EM(rxrpc_conn_put_unidle, "PUT unidle ") \ EM(rxrpc_conn_put_work, "PUT work ") \ EM(rxrpc_conn_queue_challenge, "QUE chall ") \ EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \ EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \ EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ E_(rxrpc_conn_see_work, "SEE work ") #define rxrpc_client_traces \ EM(rxrpc_client_activate_chans, "Activa") \ EM(rxrpc_client_alloc, "Alloc ") \ EM(rxrpc_client_chan_activate, "ChActv") \ EM(rxrpc_client_chan_disconnect, "ChDisc") \ EM(rxrpc_client_chan_pass, "ChPass") \ EM(rxrpc_client_cleanup, "Clean ") \ EM(rxrpc_client_discard, "Discar") \ EM(rxrpc_client_exposed, "Expose") \ EM(rxrpc_client_replace, "Replac") \ EM(rxrpc_client_queue_new_call, "Q-Call") \ EM(rxrpc_client_to_active, "->Actv") \ E_(rxrpc_client_to_idle, "->Idle") #define rxrpc_call_traces \ EM(rxrpc_call_get_io_thread, "GET iothread") \ EM(rxrpc_call_get_input, "GET input ") \ EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ EM(rxrpc_call_get_notify_socket, "GET notify ") \ EM(rxrpc_call_get_poke, "GET poke ") \ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg 
") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_discard_error, "PUT disc-err") \ EM(rxrpc_call_put_io_thread, "PUT iothread") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ EM(rxrpc_call_put_userid, "PUT user-id ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ EM(rxrpc_txqueue_dequeue, "DEQ") \ EM(rxrpc_txqueue_end, "END") \ EM(rxrpc_txqueue_queue, "QUE") \ EM(rxrpc_txqueue_queue_last, "QLS") \ EM(rxrpc_txqueue_rotate, "ROT") \ EM(rxrpc_txqueue_rotate_last, "RLS") \ E_(rxrpc_txqueue_wait, "WAI") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ EM(rxrpc_receive_front, "FRN") \ EM(rxrpc_receive_incoming, "INC") \ EM(rxrpc_receive_queue, "QUE") \ EM(rxrpc_receive_queue_last, "QLS") \ EM(rxrpc_receive_queue_oos, "QUO") \ EM(rxrpc_receive_queue_oos_last, "QOL") \ EM(rxrpc_receive_oos, "OOS") \ EM(rxrpc_receive_oos_last, "OSL") \ EM(rxrpc_receive_rotate, "ROT") \ E_(rxrpc_receive_rotate_last, "RLS") #define rxrpc_recvmsg_traces \ EM(rxrpc_recvmsg_cont, "CONT") \ EM(rxrpc_recvmsg_data_return, "DATA") \ EM(rxrpc_recvmsg_dequeue, "DEQU") \ EM(rxrpc_recvmsg_enter, "ENTR") \ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \ EM(rxrpc_recvmsg_unqueue, "UNQU") \ E_(rxrpc_recvmsg_wait, "WAIT") #define rxrpc_rtt_tx_traces \ EM(rxrpc_rtt_tx_cancel, "CNCE") \ EM(rxrpc_rtt_tx_data, "DATA") \ EM(rxrpc_rtt_tx_no_slot, "FULL") \ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ EM(rxrpc_rtt_rx_cancel, "CNCL") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ EM(rxrpc_rtt_rx_ping_response, "PONG") \ E_(rxrpc_rtt_rx_requested_ack, "RACK") #define rxrpc_timer_traces \ EM(rxrpc_timer_begin, "Begin ") \ EM(rxrpc_timer_exp_ack, "ExpAck") \ EM(rxrpc_timer_exp_hard, "ExpHrd") \ EM(rxrpc_timer_exp_idle, "ExpIdl") \ EM(rxrpc_timer_exp_keepalive, "ExpKA ") \ EM(rxrpc_timer_exp_lost_ack, "ExpLoA") \ EM(rxrpc_timer_exp_normal, "ExpNml") \ EM(rxrpc_timer_exp_ping, "ExpPng") \ EM(rxrpc_timer_exp_resend, "ExpRsn") \ EM(rxrpc_timer_init_for_reply, "IniRpl") \ EM(rxrpc_timer_init_for_send_reply, "SndRpl") \ EM(rxrpc_timer_restart, "Restrt") \ EM(rxrpc_timer_set_for_ack, "SetAck") \ EM(rxrpc_timer_set_for_hard, "SetHrd") \ EM(rxrpc_timer_set_for_idle, "SetIdl") \ EM(rxrpc_timer_set_for_keepalive, "KeepAl") \ EM(rxrpc_timer_set_for_lost_ack, "SetLoA") \ 
EM(rxrpc_timer_set_for_normal, "SetNml") \ EM(rxrpc_timer_set_for_ping, "SetPng") \ EM(rxrpc_timer_set_for_resend, "SetRTx") \ E_(rxrpc_timer_set_for_send, "SetSnd") #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ EM(rxrpc_propose_ack_input_data_hole, "DataInH") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ EM(rxrpc_propose_ack_processing_op, "ProcOp ") \ EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") #define rxrpc_congest_modes \ EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ EM(RXRPC_CALL_PACKET_LOSS, "PktLoss ") \ E_(RXRPC_CALL_SLOW_START, "SlowStart") #define rxrpc_congest_changes \ EM(rxrpc_cong_begin_retransmission, " Retrans") \ EM(rxrpc_cong_cleared_nacks, " Cleared") \ EM(rxrpc_cong_new_low_nack, " NewLowN") \ EM(rxrpc_cong_no_change, " -") \ EM(rxrpc_cong_progress, " Progres") \ EM(rxrpc_cong_idle_reset, " IdleRes") \ EM(rxrpc_cong_retransmit_again, " ReTxAgn") \ EM(rxrpc_cong_rtt_window_end, " RttWinE") \ E_(rxrpc_cong_saw_nack, " SawNack") #define rxrpc_pkts \ EM(0, "?00") \ EM(RXRPC_PACKET_TYPE_DATA, "DATA") \ EM(RXRPC_PACKET_TYPE_ACK, "ACK") \ EM(RXRPC_PACKET_TYPE_BUSY, "BUSY") \ EM(RXRPC_PACKET_TYPE_ABORT, "ABORT") \ EM(RXRPC_PACKET_TYPE_ACKALL, "ACKALL") \ EM(RXRPC_PACKET_TYPE_CHALLENGE, "CHALL") \ EM(RXRPC_PACKET_TYPE_RESPONSE, "RESP") \ EM(RXRPC_PACKET_TYPE_DEBUG, "DEBUG") \ EM(9, "?09") \ EM(10, "?10") \ EM(11, "?11") \ EM(12, "?12") \ EM(RXRPC_PACKET_TYPE_VERSION, "VERSION") \ EM(14, "?14") \ E_(15, "?15") #define rxrpc_ack_names \ EM(0, "-0-") \ EM(RXRPC_ACK_REQUESTED, "REQ") \ EM(RXRPC_ACK_DUPLICATE, "DUP") \ EM(RXRPC_ACK_OUT_OF_SEQUENCE, "OOS") \ EM(RXRPC_ACK_EXCEEDS_WINDOW, "WIN") \ EM(RXRPC_ACK_NOSPACE, "MEM") \ EM(RXRPC_ACK_PING, "PNG") \ EM(RXRPC_ACK_PING_RESPONSE, "PNR") \ EM(RXRPC_ACK_DELAY, "DLY") \ EM(RXRPC_ACK_IDLE, "IDL") \ E_(RXRPC_ACK__INVALID, "-?-") #define rxrpc_sack_traces \ EM(rxrpc_sack_advance, "ADV") \ EM(rxrpc_sack_fill, "FIL") \ EM(rxrpc_sack_nack, "NAK") \ EM(rxrpc_sack_none, "---") \ E_(rxrpc_sack_oos, "OOS") #define rxrpc_completions \ EM(RXRPC_CALL_SUCCEEDED, "Succeeded") \ EM(RXRPC_CALL_REMOTELY_ABORTED, "RemoteAbort") \ EM(RXRPC_CALL_LOCALLY_ABORTED, "LocalAbort") \ EM(RXRPC_CALL_LOCAL_ERROR, "LocalError") \ E_(RXRPC_CALL_NETWORK_ERROR, "NetError") #define rxrpc_tx_points \ EM(rxrpc_tx_point_call_abort, "CallAbort") \ EM(rxrpc_tx_point_call_ack, "CallAck") \ EM(rxrpc_tx_point_call_data_frag, "CallDataFrag") \ EM(rxrpc_tx_point_call_data_nofrag, "CallDataNofrag") \ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ EM(rxrpc_reqack_already_on, "ALREADY-ON") \ 
EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ EM(rxrpc_reqack_retrans, "RETRANS ") \ EM(rxrpc_reqack_slow_start, "SLOW-START") \ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") /* ---- Must update size of stat_why_req_ack[] if more are added! */ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") /* * Generate enums for tracing information. */ #ifndef __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY #undef EM #undef E_ #define EM(a, b) a, #define E_(a, b) a enum rxrpc_abort_reason { rxrpc_abort_reasons } __mode(byte); enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); enum rxrpc_call_poke_trace { rxrpc_call_poke_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* * Export enum symbols via userspace. */ #undef EM #undef E_ #ifndef RXRPC_TRACE_ONLY_DEFINE_ENUMS #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); rxrpc_abort_reasons; rxrpc_bundle_traces; rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; rxrpc_tx_points; rxrpc_txbuf_traces; rxrpc_txqueue_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that * will be printed in the output.
*/ #undef EM #undef E_ #define EM(a, b) { a, b }, #define E_(a, b) { a, b } TRACE_EVENT(rxrpc_local, TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, int ref, int usage), TP_ARGS(local_debug_id, op, ref, usage), TP_STRUCT__entry( __field(unsigned int, local) __field(int, op) __field(int, ref) __field(int, usage) ), TP_fast_assign( __entry->local = local_debug_id; __entry->op = op; __entry->ref = ref; __entry->usage = usage; ), TP_printk("L=%08x %s r=%d u=%d", __entry->local, __print_symbolic(__entry->op, rxrpc_local_traces), __entry->ref, __entry->usage) ); TRACE_EVENT(rxrpc_peer, TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), TP_ARGS(peer_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, peer) __field(int, ref) __field(enum rxrpc_peer_trace, why) ), TP_fast_assign( __entry->peer = peer_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("P=%08x %s r=%d", __entry->peer, __print_symbolic(__entry->why, rxrpc_peer_traces), __entry->ref) ); TRACE_EVENT(rxrpc_bundle, TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why), TP_ARGS(bundle_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, bundle) __field(int, ref) __field(int, why) ), TP_fast_assign( __entry->bundle = bundle_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("CB=%08x %s r=%d", __entry->bundle, __print_symbolic(__entry->why, rxrpc_bundle_traces), __entry->ref) ); TRACE_EVENT(rxrpc_conn, TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), TP_ARGS(conn_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, conn) __field(int, ref) __field(int, why) ), TP_fast_assign( __entry->conn = conn_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("C=%08x %s r=%d", __entry->conn, __print_symbolic(__entry->why, rxrpc_conn_traces), __entry->ref) ); TRACE_EVENT(rxrpc_client, TP_PROTO(struct rxrpc_connection *conn, int channel, enum rxrpc_client_trace op), TP_ARGS(conn, channel, op), TP_STRUCT__entry( __field(unsigned int, conn) __field(u32, cid) __field(int, channel) __field(int, usage) __field(enum rxrpc_client_trace, op) ), TP_fast_assign( __entry->conn = conn ? conn->debug_id : 0; __entry->channel = channel; __entry->usage = conn ? refcount_read(&conn->ref) : -2; __entry->op = op; __entry->cid = conn ? 
conn->proto.cid : 0; ), TP_printk("C=%08x h=%2d %s i=%08x u=%d", __entry->conn, __entry->channel, __print_symbolic(__entry->op, rxrpc_client_traces), __entry->cid, __entry->usage) ); TRACE_EVENT(rxrpc_call, TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux, enum rxrpc_call_trace why), TP_ARGS(call_debug_id, ref, aux, why), TP_STRUCT__entry( __field(unsigned int, call) __field(int, ref) __field(int, why) __field(unsigned long, aux) ), TP_fast_assign( __entry->call = call_debug_id; __entry->ref = ref; __entry->why = why; __entry->aux = aux; ), TP_printk("c=%08x %s r=%d a=%lx", __entry->call, __print_symbolic(__entry->why, rxrpc_call_traces), __entry->ref, __entry->aux) ); TRACE_EVENT(rxrpc_skb, TP_PROTO(struct sk_buff *skb, int usage, int mod_count, enum rxrpc_skb_trace why), TP_ARGS(skb, usage, mod_count, why), TP_STRUCT__entry( __field(struct sk_buff *, skb) __field(int, usage) __field(int, mod_count) __field(enum rxrpc_skb_trace, why) ), TP_fast_assign( __entry->skb = skb; __entry->usage = usage; __entry->mod_count = mod_count; __entry->why = why; ), TP_printk("s=%p Rx %s u=%d m=%d", __entry->skb, __print_symbolic(__entry->why, rxrpc_skb_traces), __entry->usage, __entry->mod_count) ); TRACE_EVENT(rxrpc_rx_packet, TP_PROTO(struct rxrpc_skb_priv *sp), TP_ARGS(sp), TP_STRUCT__entry( __field_struct(struct rxrpc_host_header, hdr) ), TP_fast_assign( memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, __entry->hdr.securityIndex, __entry->hdr.flags, __print_symbolic(__entry->hdr.type, rxrpc_pkts)) ); TRACE_EVENT(rxrpc_rx_done, TP_PROTO(int result, int abort_code), TP_ARGS(result, abort_code), TP_STRUCT__entry( __field(int, result) __field(int, abort_code) ), TP_fast_assign( __entry->result = result; __entry->abort_code = abort_code; ), TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) ); TRACE_EVENT(rxrpc_abort, TP_PROTO(unsigned int call_nr, enum rxrpc_abort_reason why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), TP_ARGS(call_nr, why, cid, call_id, seq, abort_code, error), TP_STRUCT__entry( __field(unsigned int, call_nr) __field(enum rxrpc_abort_reason, why) __field(u32, cid) __field(u32, call_id) __field(rxrpc_seq_t, seq) __field(int, abort_code) __field(int, error) ), TP_fast_assign( __entry->call_nr = call_nr; __entry->why = why; __entry->cid = cid; __entry->call_id = call_id; __entry->abort_code = abort_code; __entry->error = error; __entry->seq = seq; ), TP_printk("c=%08x %08x:%08x s=%u a=%d e=%d %s", __entry->call_nr, __entry->cid, __entry->call_id, __entry->seq, __entry->abort_code, __entry->error, __print_symbolic(__entry->why, rxrpc_abort_reasons)) ); TRACE_EVENT(rxrpc_call_complete, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_call_completion, compl) __field(int, error) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->compl = call->completion; __entry->error = call->error; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x %s r=%d ac=%d", __entry->call, __print_symbolic(__entry->compl, rxrpc_completions), __entry->error, __entry->abort_code) ); TRACE_EVENT(rxrpc_txqueue, TP_PROTO(struct rxrpc_call *call, enum rxrpc_txqueue_trace why), TP_ARGS(call, why), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) 
__field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, tx_prepared) __field(int, tx_winsize) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; __entry->tx_prepared = call->tx_prepared; __entry->tx_winsize = call->tx_winsize; ), TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, __entry->acks_hard_ack, __entry->tx_top - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, __entry->tx_prepared - __entry->tx_bottom, __entry->tx_winsize) ); TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), TP_ARGS(call, seq, serial, flags), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(u8, flags) ), TP_fast_assign( __entry->call = call; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; ), TP_printk("c=%08x DATA %08x q=%08x fl=%02x", __entry->call, __entry->serial, __entry->seq, __entry->flags) ); TRACE_EVENT(rxrpc_rx_ack, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, rxrpc_serial_t ack_serial, rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks), TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_serial_t, ack_serial) __field(rxrpc_seq_t, first) __field(rxrpc_seq_t, prev) __field(u8, reason) __field(u8, n_acks) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = serial; __entry->ack_serial = ack_serial; __entry->first = first; __entry->prev = prev; __entry->reason = reason; __entry->n_acks = n_acks; ), TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_serial, __entry->first, __entry->prev, __entry->n_acks) ); TRACE_EVENT(rxrpc_rx_abort, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 abort_code), TP_ARGS(call, serial, abort_code), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = serial; __entry->abort_code = abort_code; ), TP_printk("c=%08x ABORT %08x ac=%d", __entry->call, __entry->serial, __entry->abort_code) ); TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), TP_ARGS(conn, serial, version, nonce, min_level), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, version) __field(u32, nonce) __field(u32, min_level) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = serial; __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; ), TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", __entry->conn, __entry->serial, __entry->version, __entry->nonce, __entry->min_level) ); TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), TP_ARGS(conn, serial, version, kvno, ticket_len), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) ), TP_fast_assign( __entry->conn = conn->debug_id; 
__entry->serial = serial; __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; ), TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, __entry->version, __entry->kvno, __entry->ticket_len) ); TRACE_EVENT(rxrpc_rx_rwind_change, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 rwind, bool wake), TP_ARGS(call, serial, rwind, wake), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(u32, rwind) __field(bool, wake) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = serial; __entry->rwind = rwind; __entry->wake = wake; ), TP_printk("c=%08x %08x rw=%u%s", __entry->call, __entry->serial, __entry->rwind, __entry->wake ? " wake" : "") ); TRACE_EVENT(rxrpc_tx_packet, TP_PROTO(unsigned int call_id, struct rxrpc_wire_header *whdr, enum rxrpc_tx_point where), TP_ARGS(call_id, whdr, where), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_tx_point, where) __field_struct(struct rxrpc_wire_header, whdr) ), TP_fast_assign( __entry->call = call_id; memcpy(&__entry->whdr, whdr, sizeof(__entry->whdr)); __entry->where = where; ), TP_printk("c=%08x %08x:%08x:%08x:%04x %08x %08x %02x %02x %s %s", __entry->call, ntohl(__entry->whdr.epoch), ntohl(__entry->whdr.cid), ntohl(__entry->whdr.callNumber), ntohs(__entry->whdr.serviceId), ntohl(__entry->whdr.serial), ntohl(__entry->whdr.seq), __entry->whdr.type, __entry->whdr.flags, __entry->whdr.type <= 15 ? __print_symbolic(__entry->whdr.type, rxrpc_pkts) : "?UNK", __print_symbolic(__entry->where, rxrpc_tx_points)) ); TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags, bool retrans, bool lose), TP_ARGS(call, seq, serial, flags, retrans, lose), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(u32, cid) __field(u32, call_id) __field(u8, flags) __field(bool, retrans) __field(bool, lose) ), TP_fast_assign( __entry->call = call->debug_id; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; __entry->retrans = retrans; __entry->lose = lose; ), TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s", __entry->call, __entry->cid, __entry->call_id, __entry->serial, __entry->seq, __entry->flags, __entry->retrans ? " *RETRANS*" : "", __entry->lose ? 
" *LOSE*" : "") ); TRACE_EVENT(rxrpc_tx_ack, TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, u8 reason, u8 n_acks, u16 rwind), TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, ack_first) __field(rxrpc_serial_t, ack_serial) __field(u8, reason) __field(u8, n_acks) __field(u16, rwind) ), TP_fast_assign( __entry->call = call; __entry->serial = serial; __entry->ack_first = ack_first; __entry->ack_serial = ack_serial; __entry->reason = reason; __entry->n_acks = n_acks; __entry->rwind = rwind; ), TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_first, __entry->ack_serial, __entry->n_acks, __entry->rwind) ); TRACE_EVENT(rxrpc_receive, TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why, rxrpc_serial_t serial, rxrpc_seq_t seq), TP_ARGS(call, why, serial, seq), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_receive_trace, why) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, seq) __field(rxrpc_seq_t, window) __field(rxrpc_seq_t, wtop) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->seq = seq; __entry->window = call->ackr_window; __entry->wtop = call->ackr_wtop; ), TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_receive_traces), __entry->serial, __entry->seq, __entry->window, __entry->wtop) ); TRACE_EVENT(rxrpc_recvmsg, TP_PROTO(unsigned int call_debug_id, enum rxrpc_recvmsg_trace why, int ret), TP_ARGS(call_debug_id, why, ret), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_recvmsg_trace, why) __field(int, ret) ), TP_fast_assign( __entry->call = call_debug_id; __entry->why = why; __entry->ret = ret; ), TP_printk("c=%08x %s ret=%d", __entry->call, __print_symbolic(__entry->why, rxrpc_recvmsg_traces), __entry->ret) ); TRACE_EVENT(rxrpc_recvdata, TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, rxrpc_seq_t seq, unsigned int offset, unsigned int len, int ret), TP_ARGS(call, why, seq, offset, len, ret), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_recvmsg_trace, why) __field(rxrpc_seq_t, seq) __field(unsigned int, offset) __field(unsigned int, len) __field(int, ret) ), TP_fast_assign( __entry->call = call ? 
call->debug_id : 0; __entry->why = why; __entry->seq = seq; __entry->offset = offset; __entry->len = len; __entry->ret = ret; ), TP_printk("c=%08x %s q=%08x o=%u l=%u ret=%d", __entry->call, __print_symbolic(__entry->why, rxrpc_recvmsg_traces), __entry->seq, __entry->offset, __entry->len, __entry->ret) ); TRACE_EVENT(rxrpc_rtt_tx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, int slot, rxrpc_serial_t send_serial), TP_ARGS(call, why, slot, send_serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_rtt_tx_trace, why) __field(int, slot) __field(rxrpc_serial_t, send_serial) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->slot = slot; __entry->send_serial = send_serial; ), TP_printk("c=%08x [%d] %s sr=%08x", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_tx_traces), __entry->send_serial) ); TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, u32 rtt, u32 rto), TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_rtt_rx_trace, why) __field(int, slot) __field(rxrpc_serial_t, send_serial) __field(rxrpc_serial_t, resp_serial) __field(u32, rtt) __field(u32, rto) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->slot = slot; __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; __entry->rto = rto; ), TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, __entry->rto) ); TRACE_EVENT(rxrpc_timer, TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why, unsigned long now), TP_ARGS(call, why, now), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_timer_trace, why) __field(long, now) __field(long, ack_at) __field(long, ack_lost_at) __field(long, resend_at) __field(long, ping_at) __field(long, expect_rx_by) __field(long, expect_req_by) __field(long, expect_term_by) __field(long, timer) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->now = now; __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; __entry->expect_rx_by = call->expect_rx_by; __entry->expect_req_by = call->expect_req_by; __entry->expect_term_by = call->expect_term_by; __entry->timer = call->timer.expires; ), TP_printk("c=%08x %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", __entry->call, __print_symbolic(__entry->why, rxrpc_timer_traces), __entry->ack_at - __entry->now, __entry->ack_lost_at - __entry->now, __entry->resend_at - __entry->now, __entry->expect_rx_by - __entry->now, __entry->expect_req_by - __entry->now, __entry->expect_term_by - __entry->now, __entry->timer - __entry->now) ); TRACE_EVENT(rxrpc_timer_expired, TP_PROTO(struct rxrpc_call *call, unsigned long now), TP_ARGS(call, now), TP_STRUCT__entry( __field(unsigned int, call) __field(long, now) __field(long, ack_at) __field(long, ack_lost_at) __field(long, resend_at) __field(long, ping_at) __field(long, expect_rx_by) __field(long, expect_req_by) __field(long, expect_term_by) __field(long, timer) ), TP_fast_assign( __entry->call = call->debug_id; __entry->now = now; __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; 
__entry->expect_rx_by = call->expect_rx_by; __entry->expect_req_by = call->expect_req_by; __entry->expect_term_by = call->expect_term_by; __entry->timer = call->timer.expires; ), TP_printk("c=%08x EXPIRED a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", __entry->call, __entry->ack_at - __entry->now, __entry->ack_lost_at - __entry->now, __entry->resend_at - __entry->now, __entry->expect_rx_by - __entry->now, __entry->expect_req_by - __entry->now, __entry->expect_term_by - __entry->now, __entry->timer - __entry->now) ); TRACE_EVENT(rxrpc_rx_lose, TP_PROTO(struct rxrpc_skb_priv *sp), TP_ARGS(sp), TP_STRUCT__entry( __field_struct(struct rxrpc_host_header, hdr) ), TP_fast_assign( memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, __entry->hdr.type, __entry->hdr.flags, __entry->hdr.type <= 15 ? __print_symbolic(__entry->hdr.type, rxrpc_pkts) : "?UNK") ); TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial), TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; ), TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial) ); TRACE_EVENT(rxrpc_send_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial), TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; ), TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial) ); TRACE_EVENT(rxrpc_drop_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial, bool nobuf), TP_ARGS(call, why, ack_reason, serial, nobuf), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) __field(bool, nobuf) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; __entry->nobuf = nobuf; ), TP_printk("c=%08x %s %s r=%08x nbf=%u", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial, __entry->nobuf) ); TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, s64 expiry), TP_ARGS(call, seq, expiry), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(s64, expiry) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = seq; __entry->expiry = expiry; ), TP_printk("c=%08x q=%x xp=%lld", __entry->call, __entry->seq, __entry->expiry) ); TRACE_EVENT(rxrpc_congest, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, 
rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), TP_ARGS(call, summary, ack_serial, change), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_congest_change, change) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) __field(rxrpc_serial_t, ack_serial) __field_struct(struct rxrpc_ack_summary, sum) ), TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), __entry->sum.cwnd, __entry->sum.ssthresh, __entry->sum.nr_acks, __entry->sum.saw_nacks, __entry->sum.nr_new_acks, __entry->sum.nr_rot_new_acks, __entry->top - __entry->hard_ack, __entry->sum.cumulative_acks, __entry->sum.dup_acks, __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", __print_symbolic(__entry->change, rxrpc_congest_changes), __entry->sum.retrans_timeo ? " rTxTo" : "") ); TRACE_EVENT(rxrpc_reset_cwnd, TP_PROTO(struct rxrpc_call *call, ktime_t now), TP_ARGS(call, now), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_congest_mode, mode) __field(unsigned short, cwnd) __field(unsigned short, extra) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prepared) __field(ktime_t, since_last_tx) __field(bool, has_data) ), TP_fast_assign( __entry->call = call->debug_id; __entry->mode = call->cong_mode; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->tx_prepared - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); __entry->has_data = !list_empty(&call->tx_sendmsg); ), TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", __entry->call, __entry->hard_ack, __print_symbolic(__entry->mode, rxrpc_congest_modes), __entry->cwnd, __entry->extra, __entry->prepared, ktime_to_ns(__entry->since_last_tx), __entry->has_data) ); TRACE_EVENT(rxrpc_disconnect_call, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); TRACE_EVENT(rxrpc_improper_term, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); TRACE_EVENT(rxrpc_connect_call, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned long, user_call_ID) __field(u32, cid) __field(u32, call_id) __field_struct(struct sockaddr_rxrpc, srx) ), TP_fast_assign( __entry->call = call->debug_id; __entry->user_call_ID = call->user_call_ID; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->srx = call->dest_srx; ), TP_printk("c=%08x u=%p %08x:%08x dst=%pISp", __entry->call, (void *)__entry->user_call_ID, __entry->cid, __entry->call_id, &__entry->srx.transport) ); TRACE_EVENT(rxrpc_resend, 
TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), TP_ARGS(call, ack), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_seq_t, transmitted) __field(rxrpc_serial_t, ack_serial) ), TP_fast_assign( struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = sp ? sp->hdr.serial : 0; ), TP_printk("c=%08x r=%x q=%x tq=%x", __entry->call, __entry->ack_serial, __entry->seq, __entry->transmitted) ); TRACE_EVENT(rxrpc_rx_icmp, TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee, struct sockaddr_rxrpc *srx), TP_ARGS(peer, ee, srx), TP_STRUCT__entry( __field(unsigned int, peer) __field_struct(struct sock_extended_err, ee) __field_struct(struct sockaddr_rxrpc, srx) ), TP_fast_assign( __entry->peer = peer->debug_id; memcpy(&__entry->ee, ee, sizeof(__entry->ee)); memcpy(&__entry->srx, srx, sizeof(__entry->srx)); ), TP_printk("P=%08x o=%u t=%u c=%u i=%u d=%u e=%d %pISp", __entry->peer, __entry->ee.ee_origin, __entry->ee.ee_type, __entry->ee.ee_code, __entry->ee.ee_info, __entry->ee.ee_data, __entry->ee.ee_errno, &__entry->srx.transport) ); TRACE_EVENT(rxrpc_tx_fail, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, int ret, enum rxrpc_tx_point where), TP_ARGS(debug_id, serial, ret, where), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) __field(int, ret) __field(enum rxrpc_tx_point, where) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; __entry->ret = ret; __entry->where = where; ), TP_printk("c=%08x r=%x ret=%d %s", __entry->debug_id, __entry->serial, __entry->ret, __print_symbolic(__entry->where, rxrpc_tx_points)) ); TRACE_EVENT(rxrpc_call_reset, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(u32, cid) __field(u32, call_id) __field(rxrpc_serial_t, call_serial) __field(rxrpc_serial_t, conn_serial) __field(rxrpc_seq_t, tx_seq) __field(rxrpc_seq_t, rx_seq) ), TP_fast_assign( __entry->debug_id = call->debug_id; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", __entry->debug_id, __entry->cid, __entry->call_id, __entry->call_serial, __entry->conn_serial, __entry->tx_seq, __entry->rx_seq) ); TRACE_EVENT(rxrpc_notify_socket, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial), TP_ARGS(debug_id, serial), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; ), TP_printk("c=%08x r=%08x", __entry->debug_id, __entry->serial) ); TRACE_EVENT(rxrpc_rx_discard_ack, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, prev_pkt, call_ackr_prev), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, first_soft_ack) __field(rxrpc_seq_t, call_ackr_first) __field(rxrpc_seq_t, prev_pkt) __field(rxrpc_seq_t, call_ackr_prev) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; __entry->first_soft_ack = first_soft_ack; __entry->call_ackr_first = 
call_ackr_first; __entry->prev_pkt = prev_pkt; __entry->call_ackr_prev = call_ackr_prev; ), TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", __entry->debug_id, __entry->serial, __entry->first_soft_ack, __entry->call_ackr_first, __entry->prev_pkt, __entry->call_ackr_prev) ); TRACE_EVENT(rxrpc_req_ack, TP_PROTO(unsigned int call_debug_id, rxrpc_seq_t seq, enum rxrpc_req_ack_trace why), TP_ARGS(call_debug_id, seq, why), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(enum rxrpc_req_ack_trace, why) ), TP_fast_assign( __entry->call_debug_id = call_debug_id; __entry->seq = seq; __entry->why = why; ), TP_printk("c=%08x q=%08x REQ-%s", __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->why, rxrpc_req_ack_traces)) ); TRACE_EVENT(rxrpc_txbuf, TP_PROTO(unsigned int debug_id, unsigned int call_debug_id, rxrpc_seq_t seq, int ref, enum rxrpc_txbuf_trace what), TP_ARGS(debug_id, call_debug_id, seq, ref, what), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(int, ref) __field(enum rxrpc_txbuf_trace, what) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->call_debug_id = call_debug_id; __entry->seq = seq; __entry->ref = ref; __entry->what = what; ), TP_printk("B=%08x c=%08x q=%08x %s r=%d", __entry->debug_id, __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->what, rxrpc_txbuf_traces), __entry->ref) ); TRACE_EVENT(rxrpc_poke_call, TP_PROTO(struct rxrpc_call *call, bool busy, enum rxrpc_call_poke_trace what), TP_ARGS(call, busy, what), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(bool, busy) __field(enum rxrpc_call_poke_trace, what) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; __entry->busy = busy; __entry->what = what; ), TP_printk("c=%08x %s%s", __entry->call_debug_id, __print_symbolic(__entry->what, rxrpc_call_poke_traces), __entry->busy ? "!" : "") ); TRACE_EVENT(rxrpc_call_poked, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call_debug_id) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; ), TP_printk("c=%08x", __entry->call_debug_id) ); TRACE_EVENT(rxrpc_sack, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, unsigned int sack, enum rxrpc_sack_trace what), TP_ARGS(call, seq, sack, what), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(unsigned int, sack) __field(enum rxrpc_sack_trace, what) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; __entry->seq = seq; __entry->sack = sack; __entry->what = what; ), TP_printk("c=%08x q=%08x %s k=%x", __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->what, rxrpc_sack_traces), __entry->sack) ); #undef EM #undef E_ #endif /* RXRPC_TRACE_ONLY_DEFINE_ENUMS */ #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
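/*
 * Editor's illustration (not part of the kernel header above): a minimal,
 * self-contained sketch of the EM()/E_() list-macro ("X-macro") pattern the
 * rxrpc trace header relies on.  The same list is expanded twice: once to
 * generate enum constants and once to generate a value/string table of the
 * kind fed to __print_symbolic().  All demo_* identifiers are hypothetical
 * and the block is kept out of any build.
 */
#if 0
#include <stdio.h>

#define demo_traces \
	EM(demo_trace_alloc, "ALLOC") \
	EM(demo_trace_queue, "QUEUE") \
	E_(demo_trace_free,  "FREE")

/* First expansion: the enum constants. */
#undef EM
#undef E_
#define EM(a, b) a,
#define E_(a, b) a
enum demo_trace { demo_traces };

/* Second expansion: { value, name } pairs, as __print_symbolic() expects. */
#undef EM
#undef E_
#define EM(a, b) { a, b },
#define E_(a, b) { a, b }
static const struct { int val; const char *name; } demo_trace_names[] = {
	demo_traces
};

static const char *demo_trace_name(enum demo_trace t)
{
	unsigned int i;

	for (i = 0; i < sizeof(demo_trace_names) / sizeof(demo_trace_names[0]); i++)
		if (demo_trace_names[i].val == t)
			return demo_trace_names[i].name;
	return "?";
}

int main(void)
{
	printf("%s\n", demo_trace_name(demo_trace_queue));	/* prints "QUEUE" */
	return 0;
}
#endif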
/* SPDX-License-Identifier: GPL-2.0 * * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. * Do not use in new code. */ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H #include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, }; /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. */ struct blkg_rwstat { struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; struct blkg_rwstat_sample { u64 cnt[BLKG_RWSTAT_NR]; }; static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, unsigned int idx) { return atomic64_read(&rwstat->aux_cnt[idx]) + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); } int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); void blkg_rwstat_exit(struct blkg_rwstat *rwstat); u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat * @opf: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @opf. The * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, blk_opf_t opf, uint64_t val) { struct percpu_counter *cnt; if (op_is_discard(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; else if (op_is_write(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); if (op_is_sync(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * @result: where to put the current snapshot * * Read the current snapshot of @rwstat and return it in @result. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) result->cnt[i] = percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); } /** * blkg_rwstat_total - read the total count of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Return the total count of @rwstat regardless of the IO direction. This * function can be called without synchronization and takes care of u64 * atomicity.
*/ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat_sample tmp = { }; blkg_rwstat_read(rwstat, &tmp); return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; } /** * blkg_rwstat_reset - reset a blkg_rwstat * @rwstat: blkg_rwstat to reset */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) { percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); } } /** * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } #endif /* _BLK_CGROUP_RWSTAT_H */
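/*
 * Editor's illustration (not kernel code): a simplified model of the bucket
 * selection performed by blkg_rwstat_add() above.  Every I/O is counted once
 * by direction (READ/WRITE/DISCARD) and once by synchronicity (SYNC/ASYNC),
 * and, as in blkg_rwstat_total(), only the READ and WRITE buckets make up
 * the total.  Plain u64 counters stand in for the percpu_counter/atomic64
 * pair used by the real helpers, and the rwstat_demo_* names are made up.
 */
#if 0
#include <stdbool.h>

enum { DEMO_READ, DEMO_WRITE, DEMO_SYNC, DEMO_ASYNC, DEMO_DISCARD, DEMO_NR };

struct rwstat_demo {
	unsigned long long cnt[DEMO_NR];
};

static void rwstat_demo_add(struct rwstat_demo *s, bool is_write,
			    bool is_discard, bool is_sync,
			    unsigned long long val)
{
	/* Direction bucket: discard takes precedence over write, else read. */
	if (is_discard)
		s->cnt[DEMO_DISCARD] += val;
	else if (is_write)
		s->cnt[DEMO_WRITE] += val;
	else
		s->cnt[DEMO_READ] += val;

	/* Synchronicity bucket. */
	if (is_sync)
		s->cnt[DEMO_SYNC] += val;
	else
		s->cnt[DEMO_ASYNC] += val;
}

static unsigned long long rwstat_demo_total(const struct rwstat_demo *s)
{
	/* Mirrors blkg_rwstat_total(): discards are not part of the total. */
	return s->cnt[DEMO_READ] + s->cnt[DEMO_WRITE];
}
#endif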
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/sched.h> #include <linux/random.h> #include <linux/sbitmap.h> #include <linux/seq_file.h> static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) { unsigned depth = sb->depth; sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); if (!sb->alloc_hint) return -ENOMEM; if (depth &&
!sb->round_robin) { int i; for_each_possible_cpu(i) *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth) { unsigned hint; hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } return hint; } static inline void update_alloc_hint_after_get(struct sbitmap *sb, unsigned int depth, unsigned int hint, unsigned int nr) { if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= depth - 1) hint = 0; this_cpu_write(*sb->alloc_hint, hint); } } /* * See if we have deferred clears that we can batch move */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask; if (!READ_ONCE(map->cleared)) return false; /* * First get a stable cleared mask, setting the old mask to 0. */ mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ atomic_long_andnot(mask, (atomic_long_t *)&map->word); BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint) { unsigned int bits_per_word; if (shift < 0) shift = sbitmap_calculate_shift(depth); bits_per_word = 1U << shift; if (bits_per_word > BITS_PER_LONG) return -EINVAL; sb->shift = shift; sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); sb->round_robin = round_robin; if (depth == 0) { sb->map = NULL; return 0; } if (alloc_hint) { if (init_alloc_hint(sb, flags)) return -ENOMEM; } else { sb->alloc_hint = NULL; } sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; } return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); void sbitmap_resize(struct sbitmap *sb, unsigned int depth) { unsigned int bits_per_word = 1U << sb->shift; unsigned int i; for (i = 0; i < sb->map_nr; i++) sbitmap_deferred_clear(&sb->map[i]); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); } EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { int nr; /* don't wrap if starting from 0 */ wrap = wrap && hint; while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to * exhaust the map. 
*/ if (hint && wrap) { hint = 0; continue; } return -1; } if (!test_and_set_bit_lock(nr, word)) break; hint = nr + 1; if (hint >= depth - 1) hint = 0; } return nr; } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { int nr; do { nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); if (nr != -1) break; if (!sbitmap_deferred_clear(map)) break; } while (1); return nr; } static int sbitmap_find_bit(struct sbitmap *sb, unsigned int depth, unsigned int index, unsigned int alloc_hint, bool wrap) { unsigned int i; int nr = -1; for (i = 0; i < sb->map_nr; i++) { nr = sbitmap_find_bit_in_word(&sb->map[index], min_t(unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ alloc_hint = 0; if (++index >= sb->map_nr) index = 0; } return nr; } static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); /* * Unless we're doing round robin tag allocation, just use the * alloc_hint to find the right word index. No point in looping * twice in find_next_zero_bit() for that case. */ if (sb->round_robin) alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); else alloc_hint = 0; return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint, !sb->round_robin); } int sbitmap_get(struct sbitmap *sb) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get(sb, hint); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get); static int __sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); } int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get_shallow(sb, hint, shallow_depth); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get_shallow); bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; } EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; unsigned int word_depth = __map_depth(sb, i); if (set) weight += bitmap_weight(&word->word, word_depth); else weight += bitmap_weight(&word->cleared, word_depth); } return weight; } static unsigned int sbitmap_cleared(const struct sbitmap *sb) { return __sbitmap_weight(sb, false); } unsigned int sbitmap_weight(const struct sbitmap *sb) { return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); } EXPORT_SYMBOL_GPL(sbitmap_weight); void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } 
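/*
 * Editor's note on the "busy" figure printed by sbitmap_show() above: it is
 * __sbitmap_weight(sb, true) minus sbitmap_cleared(sb), i.e. bits set in
 * ->word minus bits whose release is still parked in ->cleared.  A bit freed
 * through the deferred-clear path therefore stops counting as busy right
 * away, even though its ->word bit stays set until the next batched
 * sbitmap_deferred_clear().
 */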
EXPORT_SYMBOL_GPL(sbitmap_show); static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) { if ((offset & 0xf) == 0) { if (offset != 0) seq_putc(m, '\n'); seq_printf(m, "%08x:", offset); } if ((offset & 0x1) == 0) seq_putc(m, ' '); seq_printf(m, "%02x", byte); } void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) { u8 byte = 0; unsigned int byte_bits = 0; unsigned int offset = 0; int i; for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; while (word_bits > 0) { unsigned int bits = min(8 - byte_bits, word_bits); byte |= (word & (BIT(bits) - 1)) << byte_bits; byte_bits += bits; if (byte_bits == 8) { emit_byte(m, offset, byte); byte = 0; byte_bits = 0; offset++; } word >>= bits; word_bits -= bits; } } if (byte_bits) { emit_byte(m, offset, byte); offset++; } if (offset) seq_putc(m, '\n'); } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; unsigned int shallow_depth; /* * For each batch, we wake up one queue. We need to make sure that our * batch size is small enough that the full depth of the bitmap, * potentially limited by a shallow depth, is enough to wake up all of * the queues. * * Each full word of the bitmap has bits_per_word bits, and there might * be a partial word. There are depth / bits_per_word full words and * depth % bits_per_word bits left over. In bitwise arithmetic: * * bits_per_word = 1 << shift * depth / bits_per_word = depth >> shift * depth % bits_per_word = depth & ((1 << shift) - 1) * * Each word can be limited to sbq->min_shallow_depth bits. */ shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); depth = ((depth >> sbq->sb.shift) * shallow_depth + min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); return wake_batch; } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, round_robin, true); if (ret) return ret; sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); atomic_set(&sbq->completion_cnt, 0); atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; } for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); if (sbq->wake_batch != wake_batch) WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; unsigned int depth = (sbq->sb.depth + users - 1) / users; wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } 
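/*
 * Worked example for sbq_calc_wake_batch() above, assuming the usual
 * SBQ_WAIT_QUEUES == 8 and SBQ_WAKE_BATCH == 8 from the header:
 *
 *	shift = 6 (64 bits per word), depth = 256, min_shallow_depth = UINT_MAX:
 *		shallow_depth = min(64, UINT_MAX) = 64
 *		depth         = (256 >> 6) * 64 + min(256 & 63, 64) = 256
 *		wake_batch    = clamp(256 / 8, 1, 8) = 8
 *
 *	same bitmap, but min_shallow_depth = 4:
 *		shallow_depth = 4
 *		depth         = 4 * 4 + min(0, 4) = 16
 *		wake_batch    = clamp(16 / 8, 1, 8) = 2
 *
 * A shallow allocation limit thus shrinks the effective depth and with it
 * the wake batch, so the full depth is still enough to wake all wait queues.
 */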
EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { return sbitmap_get(&sbq->sb); } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset) { struct sbitmap *sb = &sbq->sb; unsigned int hint, depth; unsigned long index, nr; int i; if (unlikely(sb->round_robin)) return 0; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); index = SB_NR_TO_INDEX(sb, hint); for (i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); sbitmap_deferred_clear(map); if (map->word == (1UL << (map_depth - 1)) - 1) goto next; nr = find_first_zero_bit(&map->word, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; unsigned long val; get_mask = ((1UL << nr_tags) - 1) << nr; val = READ_ONCE(map->word); while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ; get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, *offset + nr_tags - 1); return get_mask; } } next: /* Jump to next index. */ if (++index >= sb->map_nr) index = 0; } return 0; } int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) { sbq->min_shallow_depth = min_shallow_depth; sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; /* * Advance the index before checking the current queue. * It improves fairness, by ensuring the queue doesn't * need to be fully emptied before trying to wake up * from the next one. 
*/ wake_index = sbq_index_inc(wake_index); if (waitqueue_active(&ws->wait)) { woken = wake_up_nr(&ws->wait, nr); if (woken == nr) break; nr -= woken; } } if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) return; atomic_add(nr, &sbq->completion_cnt); wakeups = atomic_read(&sbq->wakeup_cnt); do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) { if (likely(!sb->round_robin && tag < sb->depth)) data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); } void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags) { struct sbitmap *sb = &sbq->sb; unsigned long *addr = NULL; unsigned long mask = 0; int i; smp_mb__before_atomic(); for (i = 0; i < nr_tags; i++) { const int tag = tags[i] - offset; unsigned long *this_addr; /* since we're clearing a batch, skip the deferred map */ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; if (!addr) { addr = this_addr; } else if (addr != this_addr) { atomic_long_andnot(mask, (atomic_long_t *) addr); mask = 0; addr = this_addr; } mask |= (1UL << SB_NR_TO_BIT(sb, tag)); } if (mask) atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { /* * Once the clear bit is set, the bit may be allocated out. * * Orders READ/WRITE on the associated instance(such as request * of blk_mq) by this bit for avoiding race with re-allocation, * and its pair is the memory barrier implied in __sbitmap_get_word. * * One invariant is that the clear bit has to be zero when the bit * is in use. */ smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) { int i, wake_index; /* * Pairs with the memory barrier in set_current_state() like in * sbitmap_queue_wake_up(). 
*/ smp_mb(); wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) wake_up(&ws->wait); wake_index = sbq_index_inc(wake_index); } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) { bool first; int i; sbitmap_show(&sbq->sb, m); seq_puts(m, "alloc_hint={"); first = true; for_each_possible_cpu(i) { if (!first) seq_puts(m, ", "); first = false; seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); } seq_puts(m, "}\n"); seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { if (!sbq_wait->sbq) { sbq_wait->sbq = sbq; atomic_inc(&sbq->ws_active); add_wait_queue(&ws->wait, &sbq_wait->wait); } } EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) { list_del_init(&sbq_wait->wait.entry); if (sbq_wait->sbq) { atomic_dec(&sbq_wait->sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait); |
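/*
 * Usage sketch for the wait helpers above (hypothetical function, loosely
 * modelled on how blk-mq obtains a tag; 'ws' would typically be picked from
 * the sbq->ws[] array, e.g. via sbq_wait_ptr()):
 *
 *	static int example_get_tag(struct sbitmap_queue *sbq,
 *				   struct sbq_wait_state *ws)
 *	{
 *		DEFINE_SBQ_WAIT(wait);
 *		int tag;
 *
 *		do {
 *			tag = __sbitmap_queue_get(sbq);
 *			if (tag != -1)
 *				break;
 *			sbitmap_prepare_to_wait(sbq, ws, &wait,
 *						TASK_UNINTERRUPTIBLE);
 *			tag = __sbitmap_queue_get(sbq);
 *			if (tag != -1)
 *				break;
 *			io_schedule();
 *		} while (1);
 *		sbitmap_finish_wait(sbq, ws, &wait);
 *		return tag;
 *	}
 *
 * The matching release is sbitmap_queue_clear(sbq, tag, cpu), which defers
 * the clear via the 'cleared' word and feeds sbitmap_queue_wake_up() so that
 * batched wakeups reach the waiters registered here.
 */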
//
SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference(fl->next); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_sk_fl_rcu(np, sfl) \ for (sfl = rcu_dereference(np->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (fl->opt && fl->share == IPV6_FL_S_EXCL) { struct ipv6_txoptions *opt = fl->opt; fl->opt = NULL; kfree(opt); } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct 
ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock(); return fl; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(np->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. 
*/ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); 
for_each_sk_fl_rcu(np, sfl) count++; rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; rcu_assign_pointer(np->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (np->repflow) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return 0; } } rcu_read_unlock(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!np->repflow) return -ESRCH; np->flow_label = 0; np->repflow = 0; return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock(); return err; } } rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; np->repflow = 1; return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return 
-ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(np, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(np, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock(); return *pos ? 
ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } |
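/*
 * Userspace view (illustrative sketch): the flowlabel manager above is
 * driven via setsockopt(IPPROTO_IPV6, IPV6_FLOWLABEL_MGR) with a struct
 * in6_flowlabel_req. Requesting a kernel-chosen, exclusively owned label
 * could look roughly like this ('fd' and 'dst' are assumed to exist):
 *
 *	struct in6_flowlabel_req freq = {
 *		.flr_action  = IPV6_FL_A_GET,
 *		.flr_flags   = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL,
 *		.flr_share   = IPV6_FL_S_EXCL,
 *		.flr_label   = 0,	(0 asks the kernel to pick a free label)
 *		.flr_linger  = 6,	(seconds, cf. FL_MIN_LINGER)
 *		.flr_expires = 60,	(seconds)
 *	};
 *
 *	freq.flr_dst = dst;	(struct in6_addr of the peer)
 *	if (setsockopt(fd, IPPROTO_IPV6, IPV6_FLOWLABEL_MGR,
 *		       &freq, sizeof(freq)) == 0)
 *		(freq.flr_label now holds the label copied back by
 *		 ipv6_flowlabel_get() above)
 *
 * To actually stamp outgoing packets the returned label is then passed in
 * sin6_flowinfo, typically with the IPV6_FLOWINFO_SEND option enabled.
 */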
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. 
*/ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 
15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case 
CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. 
*/ if (gwj->mod.modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (gwj->mod.csumfunc.crc8) (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); kmem_cache_free(cgw_cache, gwj); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, 
&gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (gwj->mod.uid) { if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) goto cancel; } if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) goto cancel; } if (gwj->mod.csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &gwj->mod.csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] 
= { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = 
mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; if (mod.uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) return -EINVAL; /* update modifications with disabled softirq & quit */ local_bh_disable(); memcpy(&gwj->mod, &mod, sizeof(mod)); local_bh_enable(); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) return -ENODEV; gwj = 
kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ memcpy(&gwj->mod, &mod, sizeof(mod)); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; /* we have a match when uid is enabled and identical */ if (gwj->mod.uid || mod.uid) { if (gwj->mod.uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if 
(!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(¬ifier); if (ret) goto out_register_notifier; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0); if (ret) goto out_rtnl_register1; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); if (ret) goto out_rtnl_register2; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); if (ret) goto out_rtnl_register3; return 0; out_rtnl_register3: rtnl_unregister(PF_CAN, RTM_NEWROUTE); out_rtnl_register2: rtnl_unregister(PF_CAN, RTM_GETROUTE); out_rtnl_register1: unregister_netdevice_notifier(¬ifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(¬ifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit); |
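/*
 * Illustrative userspace sketch (not part of gw.c): creating a CAN->CAN
 * gateway job through the RTM_NEWROUTE handler registered above
 * (cgw_create_job). The interface names "can0"/"can1" are placeholders,
 * the attribute layout follows what cgw_parse_attr() expects, error
 * handling and ACK processing are trimmed, and CAP_NET_ADMIN is required.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/can/gw.h>

static void add_u32_attr(struct nlmsghdr *nh, unsigned short type, __u32 val)
{
	struct rtattr *rta;

	/* append one u32 attribute right after the current message tail */
	rta = (struct rtattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));
	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(sizeof(val));
	memcpy(RTA_DATA(rta), &val, sizeof(val));
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct {
		struct nlmsghdr nh;
		struct rtcanmsg rtcan;
		char buf[64];			/* room for the attributes */
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	__u32 src = if_nametoindex("can0");	/* placeholder interface names */
	__u32 dst = if_nametoindex("can1");
	int fd;

	if (!src || !dst)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtcanmsg));
	req.nh.nlmsg_type = RTM_NEWROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.rtcan.can_family = AF_CAN;
	req.rtcan.gwtype = CGW_TYPE_CAN_CAN;

	/* the two mandatory attributes checked in cgw_parse_attr() */
	add_u32_attr(&req.nh, CGW_SRC_IF, src);
	add_u32_attr(&req.nh, CGW_DST_IF, dst);

	fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	if (sendto(fd, &req, req.nh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}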
2 441 416 63 54 2 54 54 62 441 441 441 441 441 441 3 2 1 3 54 2 54 54 444 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); } /* API to other parts of the stack */ static struct 
sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. 
*/ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; } |
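/*
 * Illustrative userspace sketch (not part of stream_sched.c): the scheduler
 * switch performed by sctp_sched_set_sched() above is normally reached via
 * the SCTP_STREAM_SCHEDULER socket option, and per-stream weights via
 * SCTP_STREAM_SCHEDULER_VALUE, both declared in <linux/sctp.h>. This assumes
 * the UAPI struct layouts; error handling is trimmed.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/sctp.h>

/* switch an association to the priority scheduler and weight one stream */
int use_prio_scheduler(int sd, __u16 stream_id, __u16 prio)
{
	struct sctp_assoc_value av;
	struct sctp_stream_value sv;

	memset(&av, 0, sizeof(av));
	av.assoc_value = SCTP_SS_PRIO;	/* handled by sctp_sched_set_sched() */
	if (setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
		       &av, sizeof(av)) < 0)
		return -1;

	memset(&sv, 0, sizeof(sv));
	sv.stream_id = stream_id;
	sv.stream_value = prio;		/* handled by sctp_sched_set_value() */
	return setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER_VALUE,
			  &sv, sizeof(sv));
}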
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "send.h" #include "main.h" #include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/workqueue.h> #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" #include "log.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "soft-interface.h" #include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); /** * batadv_send_skb_packet() - send an already prepared packet * @skb: the packet to send * @hard_iface: the interface to use to send the broadcast packet * @dst_addr: the payload destination * * Send out an already prepared packet to the given neighbor or broadcast it * using the specified interface. Either hard_iface or neigh_node must be not * NULL. * If neigh_node is NULL, then the packet is broadcasted using hard_iface, * otherwise it is sent as unicast to the given neighbor. * * Regardless of the return value, the skb is consumed. * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { struct batadv_priv *bat_priv; struct ethhdr *ethhdr; int ret; bat_priv = netdev_priv(hard_iface->soft_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; if (unlikely(!hard_iface->net_dev)) goto send_skb_err; if (!(hard_iface->net_dev->flags & IFF_UP)) { pr_warn("Interface %s is not up - can't send packet via that interface!\n", hard_iface->net_dev->name); goto send_skb_err; } /* push to the ethernet header. 
*/ if (batadv_skb_head_push(skb, ETH_HLEN) < 0) goto send_skb_err; skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); ether_addr_copy(ethhdr->h_source, hard_iface->net_dev->dev_addr); ether_addr_copy(ethhdr->h_dest, dst_addr); ethhdr->h_proto = htons(ETH_P_BATMAN); skb_set_network_header(skb, ETH_HLEN); skb->protocol = htons(ETH_P_BATMAN); skb->dev = hard_iface->net_dev; /* Save a clone of the skb to use when decoding coded packets */ batadv_nc_skb_store_for_decoding(bat_priv, skb); /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. */ ret = dev_queue_xmit(skb); return net_xmit_eval(ret); send_skb_err: kfree_skb(skb); return NET_XMIT_DROP; } /** * batadv_send_broadcast_skb() - Send broadcast packet via hard interface * @skb: packet to be transmitted (with batadv header and no outer eth header) * @hard_iface: outgoing interface * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); } /** * batadv_send_unicast_skb() - Send unicast packet to neighbor * @skb: packet to be transmitted (with batadv header and no outer eth header) * @neigh: neighbor which is used as next hop to destination * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh) { #ifdef CONFIG_BATMAN_ADV_BATMAN_V struct batadv_hardif_neigh_node *hardif_neigh; #endif int ret; ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr); #ifdef CONFIG_BATMAN_ADV_BATMAN_V hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; batadv_hardif_neigh_put(hardif_neigh); #endif return ret; } /** * batadv_send_skb_to_orig() - Lookup next-hop and transmit skb. * @skb: Packet to be transmitted. * @orig_node: Final destination of the packet. * @recv_if: Interface used when receiving the packet (can be NULL). * * Looks up the best next-hop towards the passed originator and passes the * skb on for preparation of MAC header. If the packet originated from this * host, NULL can be passed as recv_if and no interface alternating is * attempted. * * Return: negative errno code on a failure, -EINPROGRESS if the skb is * buffered for later transmit or the NET_XMIT status returned by the * lower routine if the packet has been passed down. */ int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if) { struct batadv_priv *bat_priv = orig_node->bat_priv; struct batadv_neigh_node *neigh_node; int ret; /* batadv_find_router() increases neigh_nodes refcount if found. */ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); if (!neigh_node) { ret = -EINVAL; goto free_skb; } /* Check if the skb is too large to send in one piece and fragment * it if needed. */ if (atomic_read(&bat_priv->fragmentation) && skb->len > neigh_node->if_incoming->net_dev->mtu) { /* Fragment and send packet. 
*/ ret = batadv_frag_send_packet(skb, orig_node, neigh_node); /* skb was consumed */ skb = NULL; goto put_neigh_node; } /* try to network code the packet, if it is received on an interface * (i.e. being forwarded). If the packet originates from this node or if * network coding fails, then send the packet as usual. */ if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) ret = -EINPROGRESS; else ret = batadv_send_unicast_skb(skb, neigh_node); /* skb was consumed */ skb = NULL; put_neigh_node: batadv_neigh_node_put(neigh_node); free_skb: kfree_skb(skb); return ret; } /** * batadv_send_skb_push_fill_unicast() - extend the buffer and initialize the * common fields for unicast packets * @skb: the skb carrying the unicast header to initialize * @hdr_size: amount of bytes to push at the beginning of the skb * @orig_node: the destination node * * Return: false if the buffer extension was not possible or true otherwise. */ static bool batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_unicast_packet *unicast_packet; u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn); if (batadv_skb_head_push(skb, hdr_size) < 0) return false; unicast_packet = (struct batadv_unicast_packet *)skb->data; unicast_packet->version = BATADV_COMPAT_VERSION; /* batman packet type: unicast */ unicast_packet->packet_type = BATADV_UNICAST; /* set unicast ttl */ unicast_packet->ttl = BATADV_TTL; /* copy the destination for faster routing */ ether_addr_copy(unicast_packet->dest, orig_node->orig); /* set the destination tt version number */ unicast_packet->ttvn = ttvn; return true; } /** * batadv_send_skb_prepare_unicast() - encapsulate an skb with a unicast header * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * * Return: false if the payload could not be encapsulated or true otherwise. */ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, struct batadv_orig_node *orig_node) { size_t uni_size = sizeof(struct batadv_unicast_packet); return batadv_send_skb_push_fill_unicast(skb, uni_size, orig_node); } /** * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use * * Return: false if the payload could not be encapsulated or true otherwise. */ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig, int packet_subtype) { struct batadv_hard_iface *primary_if; struct batadv_unicast_4addr_packet *uc_4addr_packet; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* Pull the header space and fill the unicast_packet substructure. 
* We can do that because the first member of the uc_4addr_packet * is of type struct unicast_packet */ if (!batadv_send_skb_push_fill_unicast(skb, sizeof(*uc_4addr_packet), orig)) goto out; uc_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; uc_4addr_packet->u.packet_type = BATADV_UNICAST_4ADDR; ether_addr_copy(uc_4addr_packet->src, primary_if->net_dev->dev_addr); uc_4addr_packet->subtype = packet_subtype; uc_4addr_packet->reserved = 0; ret = true; out: batadv_hardif_put(primary_if); return ret; } /** * batadv_send_skb_unicast() - encapsulate and send an skb via unicast * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast * 4addr packets) * @orig_node: the originator to send the packet to * @vid: the vid to be used to search the translation table * * Wrap the given skb into a batman-adv unicast or unicast-4addr header * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied * as packet_type. Then send this frame to the given orig_node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, int packet_subtype, struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr; int ret = NET_XMIT_DROP; if (!orig_node) goto out; switch (packet_type) { case BATADV_UNICAST: if (!batadv_send_skb_prepare_unicast(skb, orig_node)) goto out; break; case BATADV_UNICAST_4ADDR: if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb, orig_node, packet_subtype)) goto out; break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It * should never be invoked with any other packet type */ goto out; } /* skb->data might have been reallocated by * batadv_send_skb_prepare_unicast{,_4addr}() */ ethhdr = eth_hdr(skb); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route * for this client. The destination will receive this packet and will * try to reroute it because the ttvn contained in the header is less * than the current one */ if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) unicast_packet->ttvn = unicast_packet->ttvn - 1; ret = batadv_send_skb_to_orig(skb, orig_node, NULL); /* skb was consumed */ skb = NULL; out: kfree_skb(skb); return ret; } /** * batadv_send_skb_via_tt_generic() - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast * 4addr packets) * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table * * Look up the recipient node for the destination address in the ethernet * header via the translation table. Wrap the given skb into a batman-adv * unicast or unicast-4addr header depending on whether BATADV_UNICAST or * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame * to the according destination node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
*/ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, int packet_subtype, u8 *dst_hint, unsigned short vid) { struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; u8 *src, *dst; int ret; src = ethhdr->h_source; dst = ethhdr->h_dest; /* if we got an hint! let's send the packet to this client (if any) */ if (dst_hint) { src = NULL; dst = dst_hint; } orig_node = batadv_transtable_search(bat_priv, src, dst, vid); ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, packet_subtype, orig_node, vid); batadv_orig_node_put(orig_node); return ret; } /** * batadv_send_skb_via_gw() - send an skb via gateway lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @vid: the vid to be used to search the translation table * * Look up the currently selected gateway. Wrap the given skb into a batman-adv * unicast header and send this frame to this gateway node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret; orig_node = batadv_gw_get_selected_orig(bat_priv); ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, BATADV_P_DATA, orig_node, vid); batadv_orig_node_put(orig_node); return ret; } /** * batadv_forw_packet_free() - free a forwarding packet * @forw_packet: The packet to free * @dropped: whether the packet is freed because is dropped * * This frees a forwarding packet and releases any resources it might * have claimed. */ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, bool dropped) { if (dropped) kfree_skb(forw_packet->skb); else consume_skb(forw_packet->skb); batadv_hardif_put(forw_packet->if_incoming); batadv_hardif_put(forw_packet->if_outgoing); if (forw_packet->queue_left) atomic_inc(forw_packet->queue_left); kfree(forw_packet); } /** * batadv_forw_packet_alloc() - allocate a forwarding packet * @if_incoming: The (optional) if_incoming to be grabbed * @if_outgoing: The (optional) if_outgoing to be grabbed * @queue_left: The (optional) queue counter to decrease * @bat_priv: The bat_priv for the mesh of this forw_packet * @skb: The raw packet this forwarding packet shall contain * * Allocates a forwarding packet and tries to get a reference to the * (optional) if_incoming, if_outgoing and queue_left. If queue_left * is NULL then bat_priv is optional, too. * * Return: An allocated forwarding packet on success, NULL otherwise. 
*/ struct batadv_forw_packet * batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, atomic_t *queue_left, struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_forw_packet *forw_packet; const char *qname; if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) { qname = "unknown"; if (queue_left == &bat_priv->bcast_queue_left) qname = "bcast"; if (queue_left == &bat_priv->batman_queue_left) qname = "batman"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s queue is full\n", qname); return NULL; } forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); if (!forw_packet) goto err; if (if_incoming) kref_get(&if_incoming->refcount); if (if_outgoing) kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&forw_packet->list); INIT_HLIST_NODE(&forw_packet->cleanup_list); forw_packet->skb = skb; forw_packet->queue_left = queue_left; forw_packet->if_incoming = if_incoming; forw_packet->if_outgoing = if_outgoing; forw_packet->num_packets = 0; return forw_packet; err: if (queue_left) atomic_inc(queue_left); return NULL; } /** * batadv_forw_packet_was_stolen() - check whether someone stole this packet * @forw_packet: the forwarding packet to check * * This function checks whether the given forwarding packet was claimed by * someone else for free(). * * Return: True if someone stole it, false otherwise. */ static bool batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet) { return !hlist_unhashed(&forw_packet->cleanup_list); } /** * batadv_forw_packet_steal() - claim a forw_packet for free() * @forw_packet: the forwarding packet to steal * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock) * * This function tries to steal a specific forw_packet from global * visibility for the purpose of getting it for free(). That means * the caller is *not* allowed to requeue it afterwards. * * Return: True if stealing was successful. False if someone else stole it * before us. */ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, spinlock_t *lock) { /* did purging routine steal it earlier? */ spin_lock_bh(lock); if (batadv_forw_packet_was_stolen(forw_packet)) { spin_unlock_bh(lock); return false; } hlist_del_init(&forw_packet->list); /* Just to spot misuse of this function */ hlist_add_fake(&forw_packet->cleanup_list); spin_unlock_bh(lock); return true; } /** * batadv_forw_packet_list_steal() - claim a list of forward packets for free() * @forw_list: the to be stolen forward packets * @cleanup_list: a backup pointer, to be able to dispose the packet later * @hard_iface: the interface to steal forward packets from * * This function claims responsibility to free any forw_packet queued on the * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * * The packets are being moved from the forw_list to the cleanup_list. This * makes it possible for already running threads to notice the claim. 
*/ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, struct hlist_head *cleanup_list, const struct batadv_hard_iface *hard_iface) { struct batadv_forw_packet *forw_packet; struct hlist_node *safe_tmp_node; hlist_for_each_entry_safe(forw_packet, safe_tmp_node, forw_list, list) { /* if purge_outstanding_packets() was called with an argument * we delete only packets belonging to the given interface */ if (hard_iface && forw_packet->if_incoming != hard_iface && forw_packet->if_outgoing != hard_iface) continue; hlist_del(&forw_packet->list); hlist_add_head(&forw_packet->cleanup_list, cleanup_list); } } /** * batadv_forw_packet_list_free() - free a list of forward packets * @head: a list of to be freed forw_packets * * This function cancels the scheduling of any packet in the provided list, * waits for any possibly running packet forwarding thread to finish and * finally, safely frees this forward packet. * * This function might sleep. */ static void batadv_forw_packet_list_free(struct hlist_head *head) { struct batadv_forw_packet *forw_packet; struct hlist_node *safe_tmp_node; hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head, cleanup_list) { cancel_delayed_work_sync(&forw_packet->delayed_work); hlist_del(&forw_packet->cleanup_list); batadv_forw_packet_free(forw_packet, true); } } /** * batadv_forw_packet_queue() - try to queue a forwarding packet * @forw_packet: the forwarding packet to queue * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock) * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list) * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue a forwarding packet. Requeuing * is prevented if the according interface is shutting down * (e.g. if batadv_forw_packet_list_steal() was called for this * packet earlier). * * Calling batadv_forw_packet_queue() after a call to * batadv_forw_packet_steal() is forbidden! * * Caller needs to ensure that forw_packet->delayed_work was initialized. */ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet, spinlock_t *lock, struct hlist_head *head, unsigned long send_time) { spin_lock_bh(lock); /* did purging routine steal it from us? */ if (batadv_forw_packet_was_stolen(forw_packet)) { /* If you got it for free() without trouble, then * don't get back into the queue after stealing... */ WARN_ONCE(hlist_fake(&forw_packet->cleanup_list), "Requeuing after batadv_forw_packet_steal() not allowed!\n"); spin_unlock_bh(lock); return; } hlist_del_init(&forw_packet->list); hlist_add_head(&forw_packet->list, head); queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work, send_time - jiffies); spin_unlock_bh(lock); } /** * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue a broadcast packet. * * Caller needs to ensure that forw_packet->delayed_work was initialized. 
*/ static void batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time) { batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock, &bat_priv->forw_bcast_list, send_time); } /** * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue an OGMv1 packet. * * Caller needs to ensure that forw_packet->delayed_work was initialized. */ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time) { batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock, &bat_priv->forw_bat_list, send_time); } /** * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * @if_in: the interface where the packet was received on * @if_out: the outgoing interface to queue on * * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the original skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet, struct batadv_hard_iface *if_in, struct batadv_hard_iface *if_out) { struct batadv_forw_packet *forw_packet; unsigned long send_time = jiffies; struct sk_buff *newskb; newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) goto err; forw_packet = batadv_forw_packet_alloc(if_in, if_out, &bat_priv->bcast_queue_left, bat_priv, newskb); if (!forw_packet) goto err_packet_free; forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); send_time += delay ? delay : msecs_to_jiffies(5); batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return NETDEV_TX_OK; err_packet_free: kfree_skb(newskb); err: return NETDEV_TX_BUSY; } /** * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * @if_in: the interface where the packet was received on * @if_out: the outgoing interface to forward to * * Transmits a broadcast packet on the specified interface either immediately * or if a delay is given after that. Furthermore, queues additional * retransmissions if this interface is a wireless one. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the original skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. 
*/ static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet, struct batadv_hard_iface *if_in, struct batadv_hard_iface *if_out) { unsigned int num_bcasts = if_out->num_bcasts; struct sk_buff *newskb; int ret = NETDEV_TX_OK; if (!delay) { newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) return NETDEV_TX_BUSY; batadv_send_broadcast_skb(newskb, if_out); num_bcasts--; } /* delayed broadcast or rebroadcasts? */ if (num_bcasts >= 1) { BATADV_SKB_CB(skb)->num_bcasts = num_bcasts; ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay, own_packet, if_in, if_out); } return ret; } /** * batadv_send_no_broadcast() - check whether (re)broadcast is necessary * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to check * @own_packet: true if it is a self-generated broadcast packet * @if_out: the outgoing interface checked and considered for (re)broadcast * * Return: False if a packet needs to be (re)broadcasted on the given interface, * true otherwise. */ static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, struct sk_buff *skb, bool own_packet, struct batadv_hard_iface *if_out) { struct batadv_hardif_neigh_node *neigh_node = NULL; struct batadv_bcast_packet *bcast_packet; u8 *orig_neigh; u8 *neigh_addr; char *type; int ret; if (!own_packet) { neigh_addr = eth_hdr(skb)->h_source; neigh_node = batadv_hardif_neigh_get(if_out, neigh_addr); } bcast_packet = (struct batadv_bcast_packet *)skb->data; orig_neigh = neigh_node ? neigh_node->orig : NULL; ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, orig_neigh); batadv_hardif_neigh_put(neigh_node); /* ok, may broadcast */ if (!ret) return false; /* no broadcast */ switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n", bcast_packet->orig, if_out->net_dev->name, type); return true; } /** * __batadv_forw_bcast_packet() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the given skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. 
*/ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if; int ret = NETDEV_TX_OK; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return NETDEV_TX_BUSY; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; if (batadv_send_no_broadcast(bat_priv, skb, own_packet, hard_iface)) { batadv_hardif_put(hard_iface); continue; } ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay, own_packet, primary_if, hard_iface); batadv_hardif_put(hard_iface); if (ret == NETDEV_TX_BUSY) break; } rcu_read_unlock(); batadv_hardif_put(primary_if); return ret; } /** * batadv_forw_bcast_packet() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); } /** * batadv_send_bcast_packet() - send and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * Consumes the provided skb. */ void batadv_send_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); consume_skb(skb); } /** * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check * * Checks whether a given packet has any (re)transmissions left on the provided * interface. * * hard_iface may be NULL: In that case the number of transmissions this skb had * so far is compared with the maximum amount of retransmissions independent of * any interface instead. * * Return: True if (re)transmissions are left, false otherwise. */ static bool batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet) { return BATADV_SKB_CB(forw_packet->skb)->num_bcasts; } /** * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a * packet * @forw_packet: the packet to decrease the counter for */ static void batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet) { BATADV_SKB_CB(forw_packet->skb)->num_bcasts--; } /** * batadv_forw_packet_is_rebroadcast() - check packet for previous transmissions * @forw_packet: the packet to check * * Return: True if this packet was transmitted before, false otherwise. 
*/ bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) { unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts; return num_bcasts != forw_packet->if_outgoing->num_bcasts; } /** * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet * @work: work queue item * * Transmits a queued broadcast packet and if necessary reschedules it. */ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { unsigned long send_time = jiffies + msecs_to_jiffies(5); struct batadv_forw_packet *forw_packet; struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct sk_buff *skb1; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) { dropped = true; goto out; } /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (!skb1) goto out; batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing); batadv_forw_packet_bcasts_dec(forw_packet); if (batadv_forw_packet_bcasts_left(forw_packet)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; } out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bcast_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } /** * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets * @bat_priv: the bat priv with all the soft interface information * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on * * This method cancels and purges any broadcast and OGMv1 packet on the given * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard * interfaces will be canceled and purged. * * This function might sleep. */ void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface) { struct hlist_head head = HLIST_HEAD_INIT; if (hard_iface) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): %s\n", __func__, hard_iface->net_dev->name); else batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s()\n", __func__); /* claim bcast list for free() */ spin_lock_bh(&bat_priv->forw_bcast_list_lock); batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head, hard_iface); spin_unlock_bh(&bat_priv->forw_bcast_list_lock); /* claim batman packet list for free() */ spin_lock_bh(&bat_priv->forw_bat_list_lock); batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head, hard_iface); spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* then cancel or wait for packet workers to finish and free */ batadv_forw_packet_list_free(&head); } |
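/*
 * Self-contained illustration (not batman-adv code) of the bounded-queue
 * accounting used by batadv_forw_packet_alloc() and batadv_forw_packet_free()
 * above: a shared counter holds the remaining queue slots, a slot is only
 * taken if the counter can be decremented without dropping below zero
 * (batadv_atomic_dec_not_zero), and it is returned on free. C11 atomics
 * stand in for the kernel's atomic_t; the initial value 64 is arbitrary.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int bcast_queue_left = 64;  /* analogous to bat_priv->bcast_queue_left */

/* mirrors batadv_atomic_dec_not_zero(): claim a slot unless the queue is full */
static bool queue_slot_get(atomic_int *left)
{
	int old = atomic_load(left);

	while (old > 0)
		if (atomic_compare_exchange_weak(left, &old, old - 1))
			return true;

	return false;	/* queue full: the caller drops the packet instead */
}

/* mirrors the atomic_inc() done when a forw_packet is freed */
static void queue_slot_put(atomic_int *left)
{
	atomic_fetch_add(left, 1);
}

int main(void)
{
	if (queue_slot_get(&bcast_queue_left)) {
		printf("queued, %d slot(s) left\n", atomic_load(&bcast_queue_left));
		queue_slot_put(&bcast_queue_left);
	}
	return 0;
}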
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * This file contains sctp stream manipulation primitives and helpers. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) { struct sctp_association *asoc; struct sctp_chunk *ch, *temp; struct sctp_outq *outq; asoc = container_of(stream, struct sctp_association, stream); outq = &asoc->outqueue; list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (sid < outcnt) continue; sctp_sched_dequeue_common(outq, ch); /* No need to call dequeue_done here because * the chunks are not scheduled by now. */ /* Mark as failed send. */ sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(ch); } } static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; sched = sctp_sched_ops_from_stream(stream); sched->free_sid(stream, sid); kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. */ static void sctp_stream_outq_migrate(struct sctp_stream *stream, struct sctp_stream *new, __u16 outcnt) { int i; if (stream->outcnt > outcnt) sctp_stream_shrink_out(stream, outcnt); if (new) { /* Here we actually move the old ext stuff into the new * buffer, because we want to keep it. Then * sctp_stream_update will swap ->out pointers.
*/ for (i = 0; i < outcnt; i++) { sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { int ret; if (outcnt <= stream->outcnt) goto out; ret = genradix_prealloc(&stream->out, outcnt, gfp); if (ret) return ret; out: stream->outcnt = outcnt; return 0; } static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { int ret; if (incnt <= stream->incnt) goto out; ret = genradix_prealloc(&stream->in, incnt, gfp); if (ret) return ret; out: stream->incnt = incnt; return 0; } int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; handle_in: sctp_stream_interleave_init(stream); if (!incnt) return 0; return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); if (ret) { kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } return ret; } void sctp_stream_free(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } void sctp_stream_clear(struct sctp_stream *stream) { int i; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; stream->in = new->in; stream->outcnt = new->outcnt; stream->incnt = new->incnt; sched->sched_all(stream); new->out.tree.root = NULL; new->in.tree.root = NULL; new->outcnt = 0; new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); return retval; } static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, __u16 str_nums, __be16 *str_list) { struct sctp_association *asoc; __u16 i; asoc = container_of(stream, struct sctp_association, stream); if (!asoc->outqueue.out_qlen) return true; if (!str_nums) return false; for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); if (SCTP_SO(stream, sid)->ext && !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } return true; } int 
sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; if (!out && !in) goto out; str_nums = params->srs_number_streams; str_list = params->srs_stream_list; if (str_nums) { int param_len = 0; if (out) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->outcnt) goto out; param_len = str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_outreq); } if (in) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->incnt) goto out; param_len += str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_inreq); } if (param_len > SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_reconf_chunk)) goto out; } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { retval = -ENOMEM; goto out; } for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { kfree(nstr_list); retval = -EAGAIN; goto out; } chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); if (!chunk) { retval = -ENOMEM; goto out; } if (out) { if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; if (!out) goto out; if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } asoc->strreset_outstanding = out + in; out: return retval; } int sctp_send_reset_assoc(struct sctp_association *asoc) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) return -ENOPROTOOPT; if (asoc->strreset_outstanding) return -EINPROGRESS; if (!sctp_outq_is_empty(&asoc->outqueue)) return -EAGAIN; chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } asoc->strreset_outstanding = 1; return 0; } int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u32 outcnt, incnt; __u16 out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = 
params->sas_outstrms; in = params->sas_instrms; outcnt = stream->outcnt + out; incnt = stream->incnt + in; if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || (!out && !in)) { retval = -EINVAL; goto out; } if (out) { retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); if (retval) goto out; } chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) { retval = -ENOMEM; goto out; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; goto out; } asoc->strreset_outstanding = !!out + !!in; out: return retval; } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { /* sctp_strreset_tsnreq is actually the basic structure * of all stream reconf params, so it's safe to use it * to access request_seq. */ struct sctp_strreset_tsnreq *req = param.v; if ((!resp_seq || req->request_seq == resp_seq) && (!type || type == req->param_hdr.type)) return param.v; } return NULL; } static void sctp_update_strreset_result(struct sctp_association *asoc, __u32 result) { asoc->strreset_result[1] = asoc->strreset_result[0]; asoc->strreset_result[0] = result; } struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __be16 *str_p = NULL; __u32 request_seq; __u16 i, nums; request_seq = ntohl(outreq->request_seq); if (ntohl(outreq->send_reset_at_tsn) > sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with * result denied, as well as to keep consistent with bsd. 
*/ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->incnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq; __u16 i, nums; __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { result = SCTP_STRRESET_IN_PROGRESS; asoc->strreset_inseq--; goto err; } chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; if (nums) for (i = 0; i < nums; i++) SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; request_seq = ntohl(tsnreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if 
(TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } if (!sctp_outq_is_empty(&asoc->outqueue)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } /* G4: The same processing as though a FWD-TSN chunk (as defined in * [RFC3758]) with all streams affected and a new cumulative TSN * ACK of the Receiver's Next TSN minus 1 were received MUST be * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); /* G3: The same processing as though a SACK chunk with no gap report * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); /* G2: Compute an appropriate value for the local endpoint's next TSN, * i.e., the next TSN assigned by the receiver of the SSN/TSN reset * chunk. The value SHOULD be the highest TSN sent by the receiver * of the request plus 1. */ next_tsn = asoc->next_tsn; asoc->ctsn_ack_point = next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. 
*/ for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, next_tsn, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_tsnresp(asoc, result, request_seq, next_tsn, init_tsn); } struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq, incnt; __u16 in, i; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; in = ntohs(addstrm->number_of_streams); incnt = stream->incnt + in; if (!in || incnt > SCTP_MAX_STREAM) goto out; if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_change_event(asoc, 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } out = ntohs(addstrm->number_of_streams); outcnt = stream->outcnt + out; if (!out || outcnt > SCTP_MAX_STREAM) goto out; ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); if (ret) goto out; chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); stream->outcnt = outcnt; result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = 
sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; struct sctp_transport *t; __u16 i, nums, flags = 0; struct sctp_paramhdr *req; __u32 result; req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); if (!req) return NULL; result = ntohl(resp->result); if (result != SCTP_STRRESET_PERFORMED) { /* if in progress, do nothing but retransmit */ if (result == SCTP_STRRESET_IN_PROGRESS) return NULL; else if (result == SCTP_STRRESET_DENIED) flags = SCTP_STREAM_RESET_DENIED; else flags = SCTP_STREAM_RESET_FAILED; } if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { sout = SCTP_SO(stream, ntohs(str_p[i])); sout->mid = 0; sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { sout = SCTP_SO(stream, i); sout->mid = 0; sout->mid_uo = 0; } } } flags |= SCTP_STREAM_RESET_OUTGOING_SSN; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) return NULL; inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); flags |= SCTP_STREAM_RESET_INCOMING_SSN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { struct sctp_strreset_resptsn *resptsn; __u32 stsn, rtsn; /* check for resptsn, as sctp_verify_reconf didn't do it*/ if (ntohs(param.p->length) != sizeof(*resptsn)) return NULL; resptsn = (struct sctp_strreset_resptsn *)resp; stsn = ntohl(resptsn->senders_next_tsn); rtsn = ntohl(resptsn->receivers_next_tsn); if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); LIST_HEAD(temp); asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); /* Clean up sacked and abandoned queues only. As the * out_chunk_list may not be empty, splice it to temp, * then get it back after sctp_outq_free is done. 
*/ list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { struct sctp_strreset_addstrm *addstrm; __u16 number; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); number = stream->outcnt - nums; if (result == SCTP_STRRESET_PERFORMED) { for (i = number; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; } else { sctp_stream_shrink_out(stream, number); stream->outcnt = number; } *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, 0, nums, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { struct sctp_strreset_addstrm *addstrm; /* if the result is performed, it's impossible for addstrm in * request. */ if (result == SCTP_STRRESET_PERFORMED) return NULL; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, nums, 0, GFP_ATOMIC); } asoc->strreset_outstanding--; asoc->strreset_outseq++; /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } return NULL; } |
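The request handlers in the file above all share the same replay window: a RECONF request whose request_seq is one or two behind asoc->strreset_inseq is treated as a retransmission and answered from the two-slot strreset_result history instead of being processed again. Below is a minimal, self-contained sketch of that windowing logic under stated assumptions: the struct and function names (sctp_reconf_demo, reconf_check_seq, seq_lt) are hypothetical, plain serial-number arithmetic stands in for the kernel's TSN_lt(), and the cached result values are illustrative.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Serial-number "less than", the same idea as the kernel's TSN_lt(). */
static bool seq_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Hypothetical per-association reconf state mirroring strreset_inseq and
 * the two-entry strreset_result history kept by the real code.
 */
struct sctp_reconf_demo {
	uint32_t next_seq;	/* like asoc->strreset_inseq */
	uint32_t result[2];	/* like asoc->strreset_result[] */
};

/* Returns 0 for a brand-new request (the caller processes it and then bumps
 * next_seq), 1 for a retransmission answered from the cached result, and
 * -1 when the sequence number falls outside the acceptable window.
 */
static int reconf_check_seq(struct sctp_reconf_demo *p, uint32_t request_seq,
			    uint32_t *cached)
{
	if (seq_lt(p->next_seq, request_seq) ||
	    seq_lt(request_seq, p->next_seq - 2))
		return -1;				/* bad seqno */
	if (seq_lt(request_seq, p->next_seq)) {
		*cached = p->result[p->next_seq - request_seq - 1];
		return 1;				/* replayed request */
	}
	return 0;					/* new request */
}

int main(void)
{
	struct sctp_reconf_demo p = { .next_seq = 100, .result = { 1, 1 } };
	uint32_t cached = 0;
	int rc;

	rc = reconf_check_seq(&p, 100, &cached);	/* 0: new request */
	printf("new:      rc=%d\n", rc);
	rc = reconf_check_seq(&p, 99, &cached);		/* 1: retransmission */
	printf("replayed: rc=%d cached=%u\n", rc, cached);
	rc = reconf_check_seq(&p, 101, &cached);	/* -1: outside window */
	printf("bad:      rc=%d\n", rc);
	return 0;
}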
// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained.
* * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline int ext4_es_is_delonly(struct extent_status *es) { return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= 
((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ |
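The inline helpers above pack the extent status flags into the top ES_FLAGS bits of the 64-bit es_pblk field and keep the physical block number in the low bits. The following is a minimal stand-alone sketch of that packing scheme: it uses a plain uint64_t instead of the real ext4 types, assumes 5 flag bits as in the enum above, and the names (demo_store, demo_status, demo_pblock, DEMO_*) are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAGS	5	/* written, unwritten, delayed, hole, referenced */
#define DEMO_SHIFT	(sizeof(uint64_t) * 8 - DEMO_FLAGS)
#define DEMO_MASK	(~(uint64_t)0 << DEMO_SHIFT)

/* Pack status flags into the high bits and the physical block into the
 * low bits, mirroring what ext4_es_store_pblock_status() does.
 */
static uint64_t demo_store(uint64_t pblk, unsigned int status)
{
	return (((uint64_t)status << DEMO_SHIFT) & DEMO_MASK) |
	       (pblk & ~DEMO_MASK);
}

/* Extract the flags again, cf. ext4_es_status(). */
static unsigned int demo_status(uint64_t packed)
{
	return packed >> DEMO_SHIFT;
}

/* Extract the physical block again, cf. ext4_es_pblock(). */
static uint64_t demo_pblock(uint64_t packed)
{
	return packed & ~DEMO_MASK;
}

int main(void)
{
	uint64_t packed = demo_store(123456789ULL, 1 << 0 /* "written" bit */);

	printf("status=%u pblock=%llu\n", demo_status(packed),
	       (unsigned long long)demo_pblock(packed));
	return 0;
}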
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET6_HASHTABLES_H #define _INET6_HASHTABLES_H #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/types.h> #include <linux/jhash.h> #include <net/inet_sock.h> #include <net/ipv6.h> #include <net/netns/hash.h> struct inet_hashinfo; static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, const __be16 fport, const u32 initval) { const u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words(lhash, fhash, ports, initval); } /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet6_ehashfn_t)(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport); inet6_ehashfn_t inet6_ehashfn; INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif); struct sock *inet6_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif, bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport, bool *refcounted, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, refcounted, inet6_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet6_lookup(net, hashinfo, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); int inet6_hash(struct sock *sk); static inline bool inet6_match(struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, const __portpair ports, const int dif, const int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_family != AF_INET6 || sk->sk_portpair != ports || !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _INET6_HASHTABLES_H */ |
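__inet6_ehashfn() above folds the two 16-bit ports into a single 32-bit word ((lport << 16) | fport) and then mixes that word with the hashes of the two addresses via jhash_3words(). Here is a self-contained sketch of the same port-folding step; a trivial placeholder mixer stands in for jhash_3words() so it builds outside the kernel, and the names (demo_ehashfn, demo_mix3) and mixing constants are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Placeholder for the kernel's jhash_3words(); only the interface shape
 * matters for this sketch, not the quality of the mixing.
 */
static uint32_t demo_mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
{
	uint32_t h = initval ^ a;

	h = (h ^ b) * 0x9e3779b1u;
	h = (h ^ c) * 0x85ebca77u;
	return h ^ (h >> 16);
}

/* Same structure as __inet6_ehashfn(): both 16-bit ports are packed into
 * one 32-bit word before hashing together with the two address hashes.
 */
static uint32_t demo_ehashfn(uint32_t laddr_hash, uint16_t lport,
			     uint32_t faddr_hash, uint16_t fport,
			     uint32_t initval)
{
	const uint32_t ports = ((uint32_t)lport << 16) | fport;

	return demo_mix3(laddr_hash, faddr_hash, ports, initval);
}

int main(void)
{
	printf("0x%08x\n", demo_ehashfn(0xdeadbeef, 443, 0xcafef00d, 51515, 42));
	return 0;
}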
// SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv4 demultiplexer driver * * Authors: Dmitry Kozlov (xeb@mail.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/if.h> #include <linux/icmp.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> #include <net/erspan.h> #include <net/icmp.h> #include <net/route.h> #include <net/xfrm.h> static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { if (version >= GREPROTO_MAX) return -EINVAL; return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(gre_add_protocol); int gre_del_protocol(const struct gre_protocol *proto, u8 version) { int ret; if (version >= GREPROTO_MAX) return -EINVAL; ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? 0 : -EBUSY; if (ret) return ret; synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(gre_del_protocol); /* Fills in tpi and returns header length to be pulled. * Note that caller must use pskb_may_pull() before pulling GRE header. */ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs) { const struct gre_base_hdr *greh; __be32 *options; int hdr_len; if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr)))) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; tpi->flags = gre_flags_to_tnl_flags(greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); tpi->proto = greh->protocol; options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { if (!skb_checksum_simple_validate(skb)) { skb_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } else if (csum_err) { *csum_err = true; return -EINVAL; } options++; } if (greh->flags & GRE_KEY) { tpi->key = *options; options++; } else { tpi->key = 0; } if (unlikely(greh->flags & GRE_SEQ)) { tpi->seq = *options; options++; } else { tpi->seq = 0; } /* WCCP version 1 and 2 protocol decoding.
* - Change protocol to IPv4/IPv6 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { u8 _val, *val; val = skb_header_pointer(skb, nhs + hdr_len, sizeof(_val), &_val); if (!val) return -EINVAL; tpi->proto = proto; if ((*val & 0xF0) != 0x40) hdr_len += 4; } tpi->hdr_len = hdr_len; /* ERSPAN ver 1 and 2 protocol sets GRE key field * to 0 and sets the configured key in the * inner erspan header field */ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) return -EINVAL; ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len); tpi->key = cpu_to_be32(get_session_id(ershdr)); } return hdr_len; } EXPORT_SYMBOL(gre_parse_header); static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; u8 ver; int ret; if (!pskb_may_pull(skb, 12)) goto drop; ver = skb->data[1]&0x7f; if (ver >= GREPROTO_MAX) goto drop; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (!proto || !proto->handler) goto drop_unlock; ret = proto->handler(skb); rcu_read_unlock(); return ret; drop_unlock: rcu_read_unlock(); drop: kfree_skb(skb); return NET_RX_DROP; } static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; int err = 0; if (ver >= GREPROTO_MAX) return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); else err = -EPROTONOSUPPORT; rcu_read_unlock(); return err; } static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); return -EAGAIN; } return 0; } static void __exit gre_exit(void) { inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } module_init(gre_init); module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_LICENSE("GPL"); |
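gre_parse_header() above relies on the GRE header having a variable length: the optional checksum block, key and sequence-number fields are each 4 bytes and are only present when the corresponding flag bit is set, which is what gre_calc_hlen() computes from the converted tunnel flags. Below is a stand-alone sketch of that length calculation over the raw RFC 2784/2890 flag bits; the names (demo_gre_hlen, DEMO_GRE_*) are hypothetical and the flag constants are written in host byte order for simplicity, unlike the kernel's __be16 GRE_* values.

#include <stdint.h>
#include <stdio.h>

/* GRE flag bits from the first 16 bits of the header (RFC 2784/2890),
 * written as host-order constants for this sketch.
 */
#define DEMO_GRE_CSUM	0x8000u
#define DEMO_GRE_KEY	0x2000u
#define DEMO_GRE_SEQ	0x1000u

/* Header length = 4-byte base header plus 4 bytes for each optional field
 * announced by the flags (checksum block, key, sequence number).
 */
static int demo_gre_hlen(uint16_t flags)
{
	int hlen = 4;

	if (flags & DEMO_GRE_CSUM)
		hlen += 4;
	if (flags & DEMO_GRE_KEY)
		hlen += 4;
	if (flags & DEMO_GRE_SEQ)
		hlen += 4;
	return hlen;
}

int main(void)
{
	printf("%d\n", demo_gre_hlen(0));			/* 4  */
	printf("%d\n", demo_gre_hlen(DEMO_GRE_KEY));		/* 8  */
	printf("%d\n", demo_gre_hlen(DEMO_GRE_CSUM | DEMO_GRE_KEY |
				     DEMO_GRE_SEQ));		/* 16 */
	return 0;
}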
2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 
3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. 
Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. 
Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. 
*/ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; /* Maximal space eaten by iovec or ancillary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 
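 *
 * A minimal pairing sketch (illustrative; "sk" is assumed to be a socket
 * the caller already owns):
 *
 *	sk_set_memalloc(sk);
 *	... socket may now dip into memory reserves for reclaim ...
 *	sk_clear_memalloc(sk);
 *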
* It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || 
tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); if (err) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto out; } err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; break; case -ENOBUFS: drop_reason = SKB_DROP_REASON_PROTO_MEM; break; default: drop_reason = SKB_NOT_DROPPED_YET; break; } out: if (reason) *reason = drop_reason; return err; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { 
sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_sk(sk)->mc_loop; #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { lock_sock(sk); WRITE_ONCE(sk->sk_priority, priority); release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); 
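/* Illustrative userspace sketch: the flag word handled above normally
 * originates from a setsockopt(SO_TIMESTAMPING) call along these lines
 * (the descriptor "fd" and this particular flag mix are assumptions for
 * the example, not a fixed requirement):
 *
 *	#include <sys/socket.h>
 *	#include <linux/net_tstamp.h>
 *
 *	int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *		    SOF_TIMESTAMPING_RX_SOFTWARE |
 *		    SOF_TIMESTAMPING_SOFTWARE |
 *		    SOF_TIMESTAMPING_OPT_ID;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
 *		       &flags, sizeof(flags)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPING)");
 *
 * The flags end up in timestamping.flags here; SOF_TIMESTAMPING_BIND_PHC
 * additionally passes a PHC vclock index via struct so_timestamping.
 */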
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). 
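 *
 * A minimal sketch of the intended pairing, as sk_setsockopt() below
 * uses it (illustrative only):
 *
 *	sockopt_lock_sock(sk);
 *	... update socket state ...
 *	sockopt_release_sock(sk);
 *
 * so the same helpers serve both the plain setsockopt() syscall path and
 * the BPF path that already holds the socket lock.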
*/ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: ret = -ENOPROTOOPT; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). 
*/ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) WRITE_ONCE(sk->sk_priority, val); else ret = -EPERM; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); } else { unsigned long t_sec = ling.l_linger; if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_PASSCRED: assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, optname, valbool); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) { ret = -EFAULT; break; } } else { memset(&timestamping, 0, sizeof(timestamping)); timestamping.flags = val; } ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: { int (*set_rcvlowat)(struct sock *sk, int val) = NULL; if (val < 0) val = INT_MAX; if (sock) set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat) ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); break; } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_PASSSEC: assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) ret = -EINVAL; else WRITE_ONCE(sk->sk_ll_usec, val); break; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) ret = -EINVAL; else WRITE_ONCE(sk->sk_busy_poll_budget, val); } break; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { ret = -EFAULT; break; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; } case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } case SO_TXREHASH: if (val < -1 || val > 1) { ret = -EINVAL; break; } if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). 
*/ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case 
SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... 
-DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { return sk_getsockopt(sock->sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) 
*/ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 
0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); else __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. 
* If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. 
*/ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : READ_ONCE(dst->dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); sock_def_write_space_wfree(sk); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. 
*/ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) { skb->destructor = sock_edemux; sock_hold(sk); return; } #endif skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb->decrypted) return false; #endif return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { if (sk_is_refcounted(skb->sk)) sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(__sock_i_ino); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; local_bh_disable(); ino = __sock_i_ino(sk); local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. 
*/ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. 
*/ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. 
*/ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; struct proto *prot = sk->sk_prot; bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg_charge && !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge()))) goto suppress_allocation; /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; if (!sk_under_memory_pressure(sk)) return 1; alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg_charge && !charged) { mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (memcg_charge && charged) mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. 
This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. */ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. 
*/ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk) { /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ if (sock_writeable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) lockdep_set_class_and_name( &sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name( &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. 
*/ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); /* Warning : release_cb() might need to release sk ownership, * ie call sock_release_ownership(sk) before us. */ if (sk->sk_prot->release_cb) sk->sk_prot->release_cb(sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. 
It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise what's the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release() is called, processes have * no access to the socket, but the network stack still does. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * At this point the socket cannot receive new packets, but it is possible * that some packets are still in flight, because another CPU may be running * the receiver and did its hash table lookup before we unhashed the socket. * Those packets will reach the receive queue and be purged by the socket * destructor. * * We may also still have packets pending on the receive queue and, probably, * our own packets waiting in device queues. sock_destroy will drain the * receive queue, but transmitted packets will delay socket destruction * until the last reference is released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ?
res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } 
mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? 
"no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; return !skb_queue_empty_lockless(&sk->sk_receive_queue) || sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. * The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. 
*/ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl);
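/*
 * Illustrative sketch (not part of the sock.c code above): with the
 * sk_ioctl()/sock_ioctl_inout()/sock_ioctl_out() wrappers shown above, a
 * protocol's ->ioctl() handler works on kernel memory only and never copies
 * to or from userspace itself. The handler below is hypothetical; only the
 * calling convention (kernel-side karg in, result copied back to userspace
 * by sock_ioctl_out() on success) reflects the code above.
 */
static int example_proto_ioctl(struct sock *sk, int cmd, int *karg)
{
	switch (cmd) {
	case SIOCOUTQ:
		/* Unsent bytes; sock_ioctl_out() copies *karg back to userspace. */
		*karg = sk_wmem_alloc_get(sk);
		return 0;
	default:
		return -ENOIOCTLCMD;
	}
}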
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GRE_H #define __LINUX_GRE_H #include <linux/skbuff.h> #include <net/ip_tunnels.h> struct gre_base_hdr { __be16 flags; __be16 protocol; } __packed; struct gre_full_hdr { struct gre_base_hdr fixed_header; __be16 csum; __be16 reserved1; __be32 key; __be32 seq; } __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 #define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); }; int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); static inline bool netif_is_gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "gretap"); } static inline bool netif_is_ip6gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } static inline int gre_calc_hlen(__be16 o_flags) { int addend = 4; if (o_flags & TUNNEL_CSUM) addend += 4; if (o_flags & TUNNEL_KEY) addend += 4; if (o_flags & TUNNEL_SEQ) addend += 4; return addend; } static inline __be16 gre_flags_to_tnl_flags(__be16 flags) { __be16 tflags = 0; if (flags & GRE_CSUM) tflags |= TUNNEL_CSUM; if (flags & GRE_ROUTING) tflags |= TUNNEL_ROUTING; if (flags & GRE_KEY) tflags |= TUNNEL_KEY; if (flags & GRE_SEQ) tflags |= TUNNEL_SEQ; if (flags & GRE_STRICT) tflags |= TUNNEL_STRICT; if (flags & GRE_REC) tflags |= TUNNEL_REC; if (flags & GRE_VERSION) tflags |= TUNNEL_VERSION; return tflags; } static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) { __be16 flags = 0; if (tflags & TUNNEL_CSUM) flags |= GRE_CSUM; if (tflags & TUNNEL_ROUTING) flags |= GRE_ROUTING; if (tflags & TUNNEL_KEY) flags |= GRE_KEY; if (tflags & TUNNEL_SEQ) flags |= GRE_SEQ; if (tflags & TUNNEL_STRICT) flags |= GRE_STRICT; if (tflags & TUNNEL_REC) flags |= GRE_REC; if (tflags & TUNNEL_VERSION) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { struct gre_base_hdr *greh; skb_push(skb, hdr_len); skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); if (flags & TUNNEL_SEQ) { *ptr = seq; ptr--; } if (flags & TUNNEL_KEY) { *ptr = key; ptr--; } if (flags & TUNNEL_CSUM && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; if (skb->ip_summed == CHECKSUM_PARTIAL) { *(__sum16 *)ptr =
csum_fold(lco_csum(skb)); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } } } #endif
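/*
 * Illustrative sketch (not part of gre.h above): how a tunnel driver might
 * combine gre_calc_hlen() and gre_build_header() when pushing an outer GRE
 * header. The flag combination, the ETH_P_TEB inner protocol and the
 * sequence-counter handling are assumptions made only for this example.
 */
static int example_push_gre_header(struct sk_buff *skb, __be32 key, u32 *seqno)
{
	__be16 tflags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ;
	int hlen = gre_calc_hlen(tflags);	/* 4 + 4 (csum) + 4 (key) + 4 (seq) = 16 */

	/* Make sure there is enough headroom for the outer header. */
	if (skb_cow_head(skb, hlen))
		return -ENOMEM;

	gre_build_header(skb, hlen, tflags, htons(ETH_P_TEB), key,
			 htonl(++(*seqno)));
	return 0;
}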
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return
-EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed);
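/*
 * Illustrative sketch (not part of bad_inode.c above): the typical pattern a
 * filesystem follows with the helpers defined above when reading an inode
 * from disk fails. "examplefs" and examplefs_read_inode() are made-up names;
 * the iget_locked()/iget_failed()/unlock_new_inode() calls reflect the real
 * API.
 */
int examplefs_read_inode(struct inode *inode);	/* hypothetical on-disk read */

static struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;	/* already cached and fully initialised */

	if (examplefs_read_inode(inode)) {
		/* I/O error: mark the inode bad, unlock it and drop it. */
		iget_failed(inode);
		return ERR_PTR(-EIO);
	}

	unlock_new_inode(inode);
	return inode;
}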
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 * TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template.c
 *      Helpers to manage template descriptors.
*/ #include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME, HDR_TEMPLATE_DATA, HDR__LAST }; static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"}, {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "evm-sig", .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, .field_show = ima_show_template_string}, {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init, .field_show = ima_show_template_digest_ng}, {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init, .field_show = ima_show_template_digest_ngv2}, {.field_id = "n-ng", .field_init = ima_eventname_ng_init, .field_show = ima_show_template_string}, {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, .field_show = ima_show_template_digest_ng}, {.field_id = "modsig", .field_init = ima_eventmodsig_init, .field_show = ima_show_template_sig}, {.field_id = "evmsig", .field_init = ima_eventevmsig_init, .field_show = ima_show_template_sig}, {.field_id = "iuid", .field_init = ima_eventinodeuid_init, .field_show = ima_show_template_uint}, {.field_id = "igid", .field_init = ima_eventinodegid_init, .field_show = ima_show_template_uint}, {.field_id = "imode", .field_init = ima_eventinodemode_init, .field_show = ima_show_template_uint}, {.field_id = "xattrnames", .field_init = ima_eventinodexattrnames_init, .field_show = ima_show_template_string}, {.field_id = "xattrlengths", .field_init = ima_eventinodexattrlengths_init, .field_show = ima_show_template_sig}, {.field_id = "xattrvalues", .field_init = ima_eventinodexattrvalues_init, .field_show = ima_show_template_sig}, }; /* * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ #define MAX_TEMPLATE_NAME_LEN \ sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode") static struct ima_template_desc *ima_template; static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. * @ima_template: IMA template to check. * * Tells whether the given template has fields referencing a file's appended * signature. 
*/ bool ima_template_has_modsig(const struct ima_template_desc *ima_template) { int i; for (i = 0; i < ima_template->num_fields; i++) if (!strcmp(ima_template->fields[i]->field_id, "modsig") || !strcmp(ima_template->fields[i]->field_id, "d-modsig")) return true; return false; } static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; int template_len = strlen(str); if (template_setup_done) return 1; if (!ima_template) ima_init_template_list(); /* * Verify that a template with the supplied name exists. * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. */ template_desc = lookup_template_desc(str); if (!template_desc) { pr_err("template %s not found, using %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } /* * Verify whether the current hash algorithm is supported * by the 'ima' template. */ if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { pr_err("template does not support hash alg\n"); return 1; } ima_template = template_desc; template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { pr_err("format string '%s' not valid, using template %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; template_setup_done = 1; return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); struct ima_template_desc *lookup_template_desc(const char *name) { struct ima_template_desc *template_desc; int found = 0; rcu_read_lock(); list_for_each_entry_rcu(template_desc, &defined_templates, list) { if ((strcmp(template_desc->name, name) == 0) || (strcmp(template_desc->fmt, name) == 0)) { found = 1; break; } } rcu_read_unlock(); return found ? template_desc : NULL; } static const struct ima_template_field * lookup_template_field(const char *field_id) { int i; for (i = 0; i < ARRAY_SIZE(supported_fields); i++) if (strncmp(supported_fields[i].field_id, field_id, IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) return &supported_fields[i]; return NULL; } static int template_fmt_size(const char *template_fmt) { char c; int template_fmt_len = strlen(template_fmt); int i = 0, j = 0; while (i < template_fmt_len) { c = template_fmt[i]; if (c == '|') j++; i++; } return j + 1; } int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; if (num_fields && *num_fields > 0) /* already initialized? 
*/ return 0; template_num_fields = template_fmt_size(template_fmt); if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); return -EINVAL; } for (i = 0, template_fmt_ptr = template_fmt; i < template_num_fields; i++, template_fmt_ptr += len + 1) { char tmp_field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1]; len = strchrnul(template_fmt_ptr, '|') - template_fmt_ptr; if (len == 0 || len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) { pr_err("Invalid field with length %d\n", len); return -EINVAL; } memcpy(tmp_field_id, template_fmt_ptr, len); tmp_field_id[len] = '\0'; found_fields[i] = lookup_template_field(tmp_field_id); if (!found_fields[i]) { pr_err("field '%s' not found\n", tmp_field_id); return -ENOENT; } } if (fields && num_fields) { *fields = kmalloc_array(i, sizeof(**fields), GFP_KERNEL); if (*fields == NULL) return -ENOMEM; memcpy(*fields, found_fields, i * sizeof(**fields)); *num_fields = i; } return 0; } void ima_init_template_list(void) { int i; if (!list_empty(&defined_templates)) return; spin_lock(&template_list); for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { list_add_tail_rcu(&builtin_templates[i].list, &defined_templates); } spin_unlock(&template_list); } struct ima_template_desc *ima_template_desc_current(void) { if (!ima_template) { ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); } return ima_template; } struct ima_template_desc *ima_template_desc_buf(void) { if (!ima_buf_template) { ima_init_template_list(); ima_buf_template = lookup_template_desc("ima-buf"); } return ima_buf_template; } int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) { pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } template = ima_template_desc_buf(); if (!template) { pr_err("Failed to get ima-buf template\n"); return -EINVAL; } result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? 
template->name : template->fmt), result); return result; } static struct ima_template_desc *restore_template_fmt(char *template_name) { struct ima_template_desc *template_desc = NULL; int ret; ret = template_desc_init_fields(template_name, NULL, NULL); if (ret < 0) { pr_err("attempting to initialize the template \"%s\" failed\n", template_name); goto out; } template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL); if (!template_desc) goto out; template_desc->name = ""; template_desc->fmt = kstrdup(template_name, GFP_KERNEL); if (!template_desc->fmt) { kfree(template_desc); template_desc = NULL; goto out; } spin_lock(&template_list); list_add_tail_rcu(&template_desc->list, &defined_templates); spin_unlock(&template_list); out: return template_desc; } static int ima_restore_template_data(struct ima_template_desc *template_desc, void *template_data, int template_data_size, struct ima_template_entry **entry) { struct tpm_digest *digests; int ret = 0; int i; *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); return -ENOMEM; } (*entry)->digests = digests; ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { kfree((*entry)->digests); kfree(*entry); return ret; } (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { struct ima_field_data *field_data = &(*entry)->template_data[i]; u8 *data = field_data->data; (*entry)->template_data[i].data = kzalloc(field_data->len + 1, GFP_KERNEL); if (!(*entry)->template_data[i].data) { ret = -ENOMEM; break; } memcpy((*entry)->template_data[i].data, data, field_data->len); (*entry)->template_data_len += sizeof(field_data->len); (*entry)->template_data_len += field_data->len; } if (ret < 0) { ima_free_template_entry(*entry); *entry = NULL; } return ret; } /* Restore the serialized binary measurement list without extending PCRs. 
*/ int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { [HDR_PCR] = {.len = sizeof(u32)}, [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE}, }; void *bufp = buf + sizeof(*khdr); void *bufendp; struct ima_template_entry *entry; struct ima_template_desc *template_desc; DECLARE_BITMAP(hdr_mask, HDR__LAST); unsigned long count = 0; int ret = 0; if (!buf || size < sizeof(*khdr)) return 0; if (ima_canonical_fmt) { khdr->version = le16_to_cpu((__force __le16)khdr->version); khdr->count = le64_to_cpu((__force __le64)khdr->count); khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size); } if (khdr->version != 1) { pr_err("attempting to restore a incompatible measurement list"); return -EINVAL; } if (khdr->count > ULONG_MAX - 1) { pr_err("attempting to restore too many measurements"); return -EINVAL; } bitmap_zero(hdr_mask, HDR__LAST); bitmap_set(hdr_mask, HDR_PCR, 1); bitmap_set(hdr_mask, HDR_DIGEST, 1); /* * ima kexec buffer prefix: version, buffer size, count * v1 format: pcr, digest, template-name-len, template-name, * template-data-size, template-data */ bufendp = buf + khdr->buffer_size; while ((bufp < bufendp) && (count++ < khdr->count)) { int enforce_mask = ENFORCE_FIELDS; enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0; ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL, hdr_mask, enforce_mask, "entry header"); if (ret < 0) break; if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) { pr_err("attempting to restore a template name that is too long\n"); ret = -EINVAL; break; } /* template name is not null terminated */ memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data, hdr[HDR_TEMPLATE_NAME].len); template_name[hdr[HDR_TEMPLATE_NAME].len] = 0; if (strcmp(template_name, "ima") == 0) { pr_err("attempting to restore an unsupported template \"%s\" failed\n", template_name); ret = -EINVAL; break; } template_desc = lookup_template_desc(template_name); if (!template_desc) { template_desc = restore_template_fmt(template_name); if (!template_desc) break; } /* * Only the running system's template format is initialized * on boot. As needed, initialize the other template formats. */ ret = template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); if (ret < 0) { pr_err("attempting to restore the template fmt \"%s\" failed\n", template_desc->fmt); ret = -EINVAL; break; } ret = ima_restore_template_data(template_desc, hdr[HDR_TEMPLATE_DATA].data, hdr[HDR_TEMPLATE_DATA].len, &entry); if (ret < 0) break; if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { ret = ima_calc_field_array_hash( &entry->template_data[0], entry); if (ret < 0) { pr_err("cannot calculate template digest\n"); ret = -EINVAL; break; } } entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) : le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); if (ret < 0) break; } return ret; } |
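For reference, the two __setup() handlers above parse kernel command-line options. Example values are shown below (illustrative only; because both handlers check and set template_setup_done, only whichever option is processed first takes effect, and ima_template_fmt= accepts only fields listed in supported_fields):

	ima_template=ima-sig
	ima_template_fmt=d-ng|n-ng|buf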
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NS_HASH_H__
#define __NET_NS_HASH_H__

#include <net/net_namespace.h>

static inline u32 net_hash_mix(const struct net *net)
{
	return net->hash_mix;
}
#endif
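net_hash_mix() is intended to be folded into hash computations so that identical flows in different network namespaces land in different hash buckets. A minimal sketch under that assumption follows, using jhash_3words() from <linux/jhash.h>; the function name and the initval seed are made up for the example.

#include <linux/jhash.h>
#include <net/netns/hash.h>	/* the header above, as laid out in the mainline tree */

/* Illustrative only: mix the namespace into a simple two-address flow hash. */
static u32 example_flow_hash(const struct net *net, __be32 saddr, __be32 daddr,
			     u32 initval)
{
	return jhash_3words((__force u32)saddr, (__force u32)daddr,
			    net_hash_mix(net), initval);
}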
/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Robert Elz at The University of Melbourne.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _LINUX_QUOTA_ #define _LINUX_QUOTA_ #include <linux/list.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/wait.h> #include <linux/percpu_counter.h> #include <linux/dqblk_xfs.h> #include <linux/dqblk_v1.h> #include <linux/dqblk_v2.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/projid.h> #include <uapi/linux/quota.h> #undef USRQUOTA #undef GRPQUOTA #undef PRJQUOTA enum quota_type { USRQUOTA = 0, /* element used for user quotas */ GRPQUOTA = 1, /* element used for group quotas */ PRJQUOTA = 2, /* element used for project quotas */ }; /* Masks for quota types when used as a bitmask */ #define QTYPE_MASK_USR (1 << USRQUOTA) #define QTYPE_MASK_GRP (1 << GRPQUOTA) #define QTYPE_MASK_PRJ (1 << PRJQUOTA) typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ typedef long long qsize_t; /* Type in which we store sizes */ struct kqid { /* Type in which we store the quota identifier */ union { kuid_t uid; kgid_t gid; kprojid_t projid; }; enum quota_type type; /* USRQUOTA (uid) or GRPQUOTA (gid) or PRJQUOTA (projid) */ }; extern bool qid_eq(struct kqid left, struct kqid right); extern bool qid_lt(struct kqid left, struct kqid right); extern qid_t from_kqid(struct user_namespace *to, struct kqid qid); extern qid_t from_kqid_munged(struct user_namespace *to, struct kqid qid); extern bool qid_valid(struct kqid qid); /** * make_kqid - Map a user-namespace, type, qid tuple into a kqid. * @from: User namespace that the qid is in * @type: The type of quota * @qid: Quota identifier * * Maps a user-namespace, type qid tuple into a kernel internal * kqid, and returns that kqid. * * When there is no mapping defined for the user-namespace, type, * qid tuple an invalid kqid is returned. Callers are expected to * test for and handle invalid kqids being returned. * Invalid kqids may be tested for using qid_valid(). */ static inline struct kqid make_kqid(struct user_namespace *from, enum quota_type type, qid_t qid) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = make_kuid(from, qid); break; case GRPQUOTA: kqid.gid = make_kgid(from, qid); break; case PRJQUOTA: kqid.projid = make_kprojid(from, qid); break; default: BUG(); } return kqid; } /** * make_kqid_invalid - Explicitly make an invalid kqid * @type: The type of quota identifier * * Returns an invalid kqid with the specified type. 
*/ static inline struct kqid make_kqid_invalid(enum quota_type type) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = INVALID_UID; break; case GRPQUOTA: kqid.gid = INVALID_GID; break; case PRJQUOTA: kqid.projid = INVALID_PROJID; break; default: BUG(); } return kqid; } /** * make_kqid_uid - Make a kqid from a kuid * @uid: The kuid to make the quota identifier from */ static inline struct kqid make_kqid_uid(kuid_t uid) { struct kqid kqid; kqid.type = USRQUOTA; kqid.uid = uid; return kqid; } /** * make_kqid_gid - Make a kqid from a kgid * @gid: The kgid to make the quota identifier from */ static inline struct kqid make_kqid_gid(kgid_t gid) { struct kqid kqid; kqid.type = GRPQUOTA; kqid.gid = gid; return kqid; } /** * make_kqid_projid - Make a kqid from a projid * @projid: The kprojid to make the quota identifier from */ static inline struct kqid make_kqid_projid(kprojid_t projid) { struct kqid kqid; kqid.type = PRJQUOTA; kqid.projid = projid; return kqid; } /** * qid_has_mapping - Report if a qid maps into a user namespace. * @ns: The user namespace to see if a value maps into. * @qid: The kernel internal quota identifier to test. */ static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) { return from_kqid(ns, qid) != (qid_t) -1; } extern spinlock_t dq_data_lock; /* Maximal numbers of writes for quota operation (insert/delete/update) * (over VFS all formats) */ #define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC) #define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE) #define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC) #define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE) /* * Data for one user/group kept in memory */ struct mem_dqblk { qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ qsize_t dqb_curspace; /* current used space */ qsize_t dqb_rsvspace; /* current reserved space for delalloc*/ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ qsize_t dqb_isoftlimit; /* preferred inode limit */ qsize_t dqb_curinodes; /* current # allocated inodes */ time64_t dqb_btime; /* time limit for excessive disk use */ time64_t dqb_itime; /* time limit for excessive inode use */ }; /* * Data for one quotafile kept in memory */ struct quota_format_type; struct mem_dqinfo { struct quota_format_type *dqi_format; int dqi_fmt_id; /* Id of the dqi_format - used when turning * quotas on after remount RW */ struct list_head dqi_dirty_list; /* List of dirty dquots [dq_list_lock] */ unsigned long dqi_flags; /* DFQ_ flags [dq_data_lock] */ unsigned int dqi_bgrace; /* Space grace time [dq_data_lock] */ unsigned int dqi_igrace; /* Inode grace time [dq_data_lock] */ qsize_t dqi_max_spc_limit; /* Maximum space limit [static] */ qsize_t dqi_max_ino_limit; /* Maximum inode limit [static] */ void *dqi_priv; }; struct super_block; /* Mask for flags passed to userspace */ #define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE) /* Mask for flags modifiable from userspace */ #define DQF_SETINFO_MASK DQF_ROOT_SQUASH enum { DQF_INFO_DIRTY_B = DQF_PRIVATE, }; #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? 
*/ extern void mark_info_dirty(struct super_block *sb, int type); static inline int info_dirty(struct mem_dqinfo *info) { return test_bit(DQF_INFO_DIRTY_B, &info->dqi_flags); } enum { DQST_LOOKUPS, DQST_DROPS, DQST_READS, DQST_WRITES, DQST_CACHE_HITS, DQST_ALLOC_DQUOTS, DQST_FREE_DQUOTS, DQST_SYNCS, _DQST_DQSTAT_LAST }; struct dqstats { unsigned long stat[_DQST_DQSTAT_LAST]; struct percpu_counter counter[_DQST_DQSTAT_LAST]; }; extern struct dqstats dqstats; static inline void dqstats_inc(unsigned int type) { percpu_counter_inc(&dqstats.counter[type]); } static inline void dqstats_dec(unsigned int type) { percpu_counter_dec(&dqstats.counter[type]); } #define DQ_MOD_B 0 /* dquot modified since read */ #define DQ_BLKS_B 1 /* uid/gid has been warned about blk limit */ #define DQ_INODES_B 2 /* uid/gid has been warned about inode limit */ #define DQ_FAKE_B 3 /* no limits only usage */ #define DQ_READ_B 4 /* dquot was read into memory */ #define DQ_ACTIVE_B 5 /* dquot is active (dquot_release not called) */ #define DQ_LASTSET_B 6 /* Following 6 bits (see QIF_) are reserved\ * for the mask of entries set via SETQUOTA\ * quotactl. They are set under dq_data_lock\ * and the quota format handling dquot can\ * clear them when it sees fit. */ struct dquot { struct hlist_node dq_hash; /* Hash list in memory [dq_list_lock] */ struct list_head dq_inuse; /* List of all quotas [dq_list_lock] */ struct list_head dq_free; /* Free list element [dq_list_lock] */ struct list_head dq_dirty; /* List of dirty dquots [dq_list_lock] */ struct mutex dq_lock; /* dquot IO lock */ spinlock_t dq_dqb_lock; /* Lock protecting dq_dqb changes */ atomic_t dq_count; /* Use count */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ loff_t dq_off; /* Offset of dquot on disk [dq_lock, stable once set] */ unsigned long dq_flags; /* See DQ_* */ struct mem_dqblk dq_dqb; /* Diskquota usage [dq_dqb_lock] */ }; /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file - called on quotaon() */ int (*write_file_info)(struct super_block *sb, int type); /* Write main info about file */ int (*free_file_info)(struct super_block *sb, int type); /* Called on quotaoff() */ int (*read_dqblk)(struct dquot *dquot); /* Read structure for one user */ int (*commit_dqblk)(struct dquot *dquot); /* Write structure for one user */ int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ int (*get_next_id)(struct super_block *sb, struct kqid *qid); /* Get next ID with existing structure in the quota file */ }; /* Operations working with dquots */ struct dquot_operations { int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ int (*acquire_dquot) (struct dquot *); /* Quota is going to be created on disk */ int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct 
inode *); int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */ /* Get number of inodes that were charged for a given inode */ int (*get_inode_usage) (struct inode *, qsize_t *); /* Get next ID with active quota structure */ int (*get_next_id) (struct super_block *sb, struct kqid *qid); }; struct path; /* Structure for communicating via ->get_dqblk() & ->set_dqblk() */ struct qc_dqblk { int d_fieldmask; /* mask of fields to change in ->set_dqblk() */ u64 d_spc_hardlimit; /* absolute limit on used space */ u64 d_spc_softlimit; /* preferred limit on used space */ u64 d_ino_hardlimit; /* maximum # allocated inodes */ u64 d_ino_softlimit; /* preferred inode limit */ u64 d_space; /* Space owned by the user */ u64 d_ino_count; /* # inodes owned by the user */ s64 d_ino_timer; /* zero if within inode limits */ /* if not, we refuse service */ s64 d_spc_timer; /* similar to above; for space */ int d_ino_warns; /* # warnings issued wrt num inodes */ int d_spc_warns; /* # warnings issued wrt used space */ u64 d_rt_spc_hardlimit; /* absolute limit on realtime space */ u64 d_rt_spc_softlimit; /* preferred limit on RT space */ u64 d_rt_space; /* realtime space owned */ s64 d_rt_spc_timer; /* similar to above; for RT space */ int d_rt_spc_warns; /* # warnings issued wrt RT space */ }; /* * Field specifiers for ->set_dqblk() in struct qc_dqblk and also for * ->set_info() in struct qc_info */ #define QC_INO_SOFT (1<<0) #define QC_INO_HARD (1<<1) #define QC_SPC_SOFT (1<<2) #define QC_SPC_HARD (1<<3) #define QC_RT_SPC_SOFT (1<<4) #define QC_RT_SPC_HARD (1<<5) #define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \ QC_RT_SPC_SOFT | QC_RT_SPC_HARD) #define QC_SPC_TIMER (1<<6) #define QC_INO_TIMER (1<<7) #define QC_RT_SPC_TIMER (1<<8) #define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER) #define QC_SPC_WARNS (1<<9) #define QC_INO_WARNS (1<<10) #define QC_RT_SPC_WARNS (1<<11) #define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS) #define QC_SPACE (1<<12) #define QC_INO_COUNT (1<<13) #define QC_RT_SPACE (1<<14) #define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE) #define QC_FLAGS (1<<15) #define QCI_SYSFILE (1 << 0) /* Quota file is hidden from userspace */ #define QCI_ROOT_SQUASH (1 << 1) /* Root squash turned on */ #define QCI_ACCT_ENABLED (1 << 2) /* Quota accounting enabled */ #define QCI_LIMITS_ENFORCED (1 << 3) /* Quota limits enforced */ /* Structures for communicating via ->get_state */ struct qc_type_state { unsigned int flags; /* Flags QCI_* */ unsigned int spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int ino_timelimit; /* Ditto for inode softlimit */ unsigned int rt_spc_timelimit; /* Ditto for real-time space */ unsigned int spc_warnlimit; /* Limit for number of space warnings */ unsigned int ino_warnlimit; /* Ditto for inodes */ unsigned int rt_spc_warnlimit; /* Ditto for real-time space */ unsigned long long ino; /* Inode number of quota file */ blkcnt_t blocks; /* Number of 512-byte blocks in the file */ blkcnt_t nextents; /* Number of extents in the file */ }; struct qc_state { unsigned int s_incoredqs; /* Number of dquots in core */ struct qc_type_state s_state[MAXQUOTAS]; /* Per quota type information */ }; /* Structure for communicating via ->set_info */ struct qc_info { int i_fieldmask; /* mask of fields to change in ->set_info() */ unsigned int i_flags; /* Flags QCI_* */ unsigned int i_spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int i_ino_timelimit; /* 
Ditto for inode softlimit */ unsigned int i_rt_spc_timelimit;/* Ditto for real-time space */ unsigned int i_spc_warnlimit; /* Limit for number of space warnings */ unsigned int i_ino_warnlimit; /* Limit for number of inode warnings */ unsigned int i_rt_spc_warnlimit; /* Ditto for real-time space */ }; /* Operations handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, const struct path *); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); int (*quota_sync)(struct super_block *, int); int (*set_info)(struct super_block *, int, struct qc_info *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_nextdqblk)(struct super_block *, struct kqid *, struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_state)(struct super_block *, struct qc_state *); int (*rm_xquota)(struct super_block *, unsigned int); }; struct quota_format_type { int qf_fmt_id; /* Quota format id */ const struct quota_format_ops *qf_ops; /* Operations of format */ struct module *qf_owner; /* Module implementing quota format */ struct quota_format_type *qf_next; }; /** * Quota state flags - they come in three flavors - for users, groups and projects. * * Actual typed flags layout: * USRQUOTA GRPQUOTA PRJQUOTA * DQUOT_USAGE_ENABLED 0x0001 0x0002 0x0004 * DQUOT_LIMITS_ENABLED 0x0008 0x0010 0x0020 * DQUOT_SUSPENDED 0x0040 0x0080 0x0100 * * Following bits are used for non-typed flags: * DQUOT_QUOTA_SYS_FILE 0x0200 * DQUOT_NEGATIVE_USAGE 0x0400 * DQUOT_NOLIST_DIRTY 0x0800 */ enum { _DQUOT_USAGE_ENABLED = 0, /* Track disk usage for users */ _DQUOT_LIMITS_ENABLED, /* Enforce quota limits for users */ _DQUOT_SUSPENDED, /* User diskquotas are off, but * we have necessary info in * memory to turn them on */ _DQUOT_STATE_FLAGS }; #define DQUOT_USAGE_ENABLED (1 << _DQUOT_USAGE_ENABLED * MAXQUOTAS) #define DQUOT_LIMITS_ENABLED (1 << _DQUOT_LIMITS_ENABLED * MAXQUOTAS) #define DQUOT_SUSPENDED (1 << _DQUOT_SUSPENDED * MAXQUOTAS) #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ #define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) #define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) /* Quota file is a special * system file and user cannot * touch it. 
Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ #define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) /* Allow negative quota usage */ /* Do not track dirty dquots in a list */ #define DQUOT_NOLIST_DIRTY (1 << (DQUOT_STATE_LAST + 2)) static inline unsigned int dquot_state_flag(unsigned int flags, int type) { return flags << type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { return (flags >> type) & DQUOT_STATE_FLAGS; } /* Bitmap of quota types where flag is set in flags */ static __always_inline unsigned dquot_state_types(unsigned flags, unsigned flag) { BUILD_BUG_ON_NOT_POWER_OF_2(flag); return (flags / flag) & ((1 << MAXQUOTAS) - 1); } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE extern void quota_send_warning(struct kqid qid, dev_t dev, const char warntype); #else static inline void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { return; } #endif /* CONFIG_QUOTA_NETLINK_INTERFACE */ struct quota_info { unsigned int flags; /* Flags for diskquotas on this device */ struct rw_semaphore dqio_sem; /* Lock quota file while I/O in progress */ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; int register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); struct quota_module_name { int qm_fmt_id; char *qm_mod_name; }; #define INIT_QUOTA_MODULE_NAMES {\ {QFMT_VFS_OLD, "quota_v1"},\ {QFMT_VFS_V0, "quota_v2"},\ {QFMT_VFS_V1, "quota_v2"},\ {0, NULL}} #endif /* _QUOTA_ */ |
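A minimal sketch of how the helpers declared above fit together when translating a kernel uid into a quota identifier for userspace; the function name is hypothetical and error handling is reduced to the unmapped case.

/* Illustrative only: build a kqid from a kuid and map it back for userspace. */
static qid_t example_uid_to_userspace_qid(struct user_namespace *ns, kuid_t uid)
{
	struct kqid qid = make_kqid_uid(uid);

	/*
	 * qid_has_mapping() is shown for illustration; from_kqid() itself
	 * already returns (qid_t)-1 when no mapping exists in @ns.
	 */
	if (!qid_has_mapping(ns, qid))
		return (qid_t)-1;

	return from_kqid(ns, qid);
}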
// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned accesses
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int	total_ref_count;

static int perf_trace_event_perm(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	if (tp_event->perf_perm) {
		ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * We checked and allowed to create parent,
	 * allow children without checking.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check current process (owner) permissions in here,
	 * because code below is called only via perf_event_open syscall.
	 */

	/* The ftrace function trace is allowed only for root.
*/ if (ftrace_event_is_function(tp_event)) { ret = perf_allow_tracepoint(&p_event->attr); if (ret) return ret; if (!is_sampling_event(p_event)) return 0; /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page * fault handler and its overall trickiness nature. */ if (!p_event->attr.exclude_callchain_user) return -EINVAL; /* * Same reason to disable user stack dump as for user space * callchains above. */ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) return -EINVAL; } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; /* Some events are ok to be traced by non-root users... */ if (p_event->attach_state == PERF_ATTACH_TASK) { if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) return 0; } /* * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ ret = perf_allow_tracepoint(&p_event->attr); if (ret) return ret; return 0; } static int perf_trace_event_reg(struct trace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; list = alloc_percpu(struct hlist_head); if (!list) goto fail; for_each_possible_cpu(cpu) INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); tp_event->perf_events = list; if (!total_ref_count) { char __percpu *buf; int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; perf_trace_buf[i] = buf; } } ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; total_ref_count++; return 0; fail: if (!total_ref_count) { int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } if (!--tp_event->perf_refcount) { free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; } return ret; } static void perf_trace_event_unreg(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; int i; if (--tp_event->perf_refcount > 0) return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); /* * Ensure our callback won't be called anymore. The buffers * will be freed after that. 
*/ tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; if (!--total_ref_count) { for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } } static int perf_trace_event_open(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); } static void perf_trace_event_close(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); } static int perf_trace_event_init(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; ret = perf_trace_event_perm(tp_event, p_event); if (ret) return ret; ret = perf_trace_event_reg(tp_event, p_event); if (ret) return ret; ret = perf_trace_event_open(p_event); if (ret) { perf_trace_event_unreg(p_event); return ret; } return 0; } int perf_trace_init(struct perf_event *p_event) { struct trace_event_call *tp_event; u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && trace_event_try_get_ref(tp_event)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) trace_event_put_ref(tp_event); break; } } mutex_unlock(&event_mutex); return ret; } void perf_trace_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } #ifdef CONFIG_KPROBE_EVENTS int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) { int ret; char *func = NULL; struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); if (IS_ERR(func)) { ret = PTR_ERR(func); return (ret == -EINVAL) ? -E2BIG : ret; } if (func[0] == '\0') { kfree(func); func = NULL; } } tp_event = create_local_trace_kprobe( func, (void *)(unsigned long)(p_event->attr.kprobe_addr), p_event->attr.probe_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; } mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); mutex_unlock(&event_mutex); out: kfree(func); return ret; } void perf_kprobe_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS int perf_uprobe_init(struct perf_event *p_event, unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; struct trace_event_call *tp_event; if (!p_event->attr.uprobe_path) return -EINVAL; path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); return (ret == -EINVAL) ? -E2BIG : ret; } if (path[0] == '\0') { ret = -EINVAL; goto out; } tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; } /* * local trace_uprobe need to hold event_mutex to call * uprobe_buffer_enable() and uprobe_buffer_disable(). * event_mutex is not required for local trace_kprobes. 
*/ mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_uprobe(tp_event); mutex_unlock(&event_mutex); out: kfree(path); return ret; } void perf_uprobe_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } #endif /* CONFIG_UPROBE_EVENTS */ int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on * the right per-cpu hlist. */ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { struct hlist_head __percpu *pcpu_list; struct hlist_head *list; pcpu_list = tp_event->perf_events; if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); } return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; /* * If TRACE_REG_PERF_DEL returns false; no custom action was performed * and we need to take the default action of dequeueing our event from * the right per-cpu hlist. */ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) hlist_del_rcu(&p_event->hlist_entry); } void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { char *raw_data; int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "perf buffer not large enough, wanted %d, have %d", size, PERF_MAX_TRACE_SIZE)) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); if (rctx < 0) return NULL; if (regs) *regs = this_cpu_ptr(&__perf_regs[rctx]); raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); NOKPROBE_SYMBOL(perf_trace_buf_alloc); void perf_trace_buf_update(void *record, u16 type) { struct trace_entry *entry = record; tracing_generic_entry_update(entry, type, tracing_gen_ctx()); } NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; struct hlist_head head; struct pt_regs regs; int rctx; int bit; if (!rcu_is_watching()) return; bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; if ((unsigned long)ops->private != smp_processor_id()) goto out; event = container_of(ops, struct perf_event, ftrace_ops); /* * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all * the perf code does is hlist_for_each_entry_rcu(), so we can * get away with simply setting the @head.first pointer in order * to create a singular list. 
*/ head.first = &event->hlist_entry; #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); memset(®s, 0, sizeof(regs)); perf_fetch_caller_regs(®s); entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) goto out; entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, ®s, &head, NULL); out: ftrace_test_recursion_unlock(bit); #undef ENTRY_SIZE } static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; return register_ftrace_function(ops); } static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; int ret = unregister_ftrace_function(ops); ftrace_free_filter(ops); return ret; } int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { struct perf_event *event = data; switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: break; case TRACE_REG_PERF_REGISTER: case TRACE_REG_PERF_UNREGISTER: return 0; case TRACE_REG_PERF_OPEN: return perf_ftrace_function_register(data); case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); return 1; case TRACE_REG_PERF_DEL: event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; return 1; } return -EINVAL; } #endif /* CONFIG_FUNCTION_TRACER */ |
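The perf_trace_init() path above is entered from the perf_event_open() syscall when userspace opens a PERF_TYPE_TRACEPOINT event whose attr.config carries a tracepoint id (the id files under tracefs, e.g. events/sched/sched_switch/id). A minimal userspace sketch of that call follows; the helper name is made up, the id lookup is omitted, and error handling is trimmed.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative only: open a counting perf event for one tracepoint id. */
static int open_tracepoint_counter(unsigned long long tracepoint_id)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = tracepoint_id;	/* matched against the ftrace event id in perf_trace_init() */
	attr.disabled = 1;

	/* perf_event_open() has no glibc wrapper; returns an fd or -1 */
	return syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		       -1 /* any cpu */, -1 /* no group */, 0);
}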
1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 
2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 
3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 
4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 
4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 
5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <net/netfilter/nf_bpf_link.h> #include <net/tcx.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); static DEFINE_IDR(link_idr); static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly = IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE }; /* * If we're handed a bigger struct than we know of, ensure all the unknown bits * are 0 - i.e. 
new user-space does not rely on any kernel feature extensions * we don't know about yet. * * There is a ToCToU between this function call and the following * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size) { int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; if (actual_size <= expected_size) return 0; if (uaddr.is_kernel) res = memchr_inv(uaddr.kernel + expected_size, 0, actual_size - expected_size) == NULL; else res = check_zeroed_user(uaddr.user + expected_size, actual_size - expected_size); if (res < 0) return res; return res ? 0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, .map_mem_usage = bpf_map_offload_map_mem_usage, }; static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); } static void bpf_map_write_active_dec(struct bpf_map *map) { atomic64_dec(&map->writecnt); } bool bpf_map_write_active(const struct bpf_map *map) { return atomic64_read(&map->writecnt) != 0; } static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); else return map->value_size; } static void maybe_wait_bpf_programs(struct bpf_map *map) { /* Wait for any running BPF programs to complete so that * userspace, when we return to it, knows that all programs * that could be running use the new map value. 
*/ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) synchronize_rcu(); } static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { int err; /* Need to create a kthread, thus must support schedule */ if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { rcu_read_lock(); err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); } bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; } static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, __u64 flags) { void *ptr; int err; if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = 
map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { err = -ENOENT; } else { err = 0; if (flags & BPF_F_LOCK) /* lock 'ptr' and copy everything but lock */ copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, ptr); /* mask lock and timer, since value wasn't zero inited */ check_and_init_map_value(map, value); } rcu_read_unlock(); } bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; } /* Please, do not use this function outside from the map creation path * (e.g. in map update path) without taking care of setting the active * memory cgroup (see at bpf_map_kmalloc_node() for example). */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, * which is used for lower order allocation requests. * * It has been observed that higher order allocation requests done by * vmalloc with __GFP_NORETRY being set might fail due to not trying * to reclaim memory from the page cache, thus we set * __GFP_RETRY_MAYFAIL to avoid such situations. */ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ if (mmapable) { BUG_ON(!PAGE_ALIGNED(size)); align = SHMLBA; flags = VM_USERMAP; } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } void bpf_map_area_free(void *area) { kvfree(area); } static u32 bpf_map_flags_retain_permanent(u32 flags) { /* Some map creation flags are not tied to the map object but * rather to the map fd instead, so they have no meaning upon * map object inspection since multiple file descriptors with * different (access) properties can exist here. Thus, given * this has zero meaning for the map itself, lets clear these * from here. */ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); } void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); map->map_extra = attr->map_extra; } static int bpf_map_alloc_id(struct bpf_map *map) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_map_free_id(struct bpf_map *map) { unsigned long flags; /* Offloaded maps are removed from the IDR store when their device * disappears - even if someone holds an fd to them they are unusable, * the memory is gone, all ops will fail; they are simply waiting for * refcnt to drop to be freed. 
*/ if (!map->id) return; spin_lock_irqsave(&map_idr_lock, flags); idr_remove(&map_idr, map->id); map->id = 0; spin_unlock_irqrestore(&map_idr_lock, flags); } #ifdef CONFIG_MEMCG_KMEM static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root * memory cgroup, get_obj_cgroup_from_current() will return NULL. * So we have to check map->objcg for being NULL each time it's * being used. */ if (memcg_bpf_enabled()) map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { if (map->objcg) obj_cgroup_put(map->objcg); } static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) { if (map->objcg) return get_mem_cgroup_from_objcg(map->objcg); return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } #else static void bpf_map_save_memcg(struct bpf_map *map) { } static void bpf_map_release_memcg(struct bpf_map *map) { } #endif static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; if (f1->offset < f2->offset) return -1; else if (f1->offset > f2->offset) return 1; return 0; } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask) { struct btf_field *field; if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); if (!field || !(field->type & field_mask)) return NULL; return field; } void btf_record_free(struct btf_record *rec) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: /* Nothing to release */ break; default: WARN_ON_ONCE(1); continue; } } kfree(rec); } void bpf_map_free_record(struct bpf_map *map) { btf_record_free(map->record); map->record = NULL; } struct btf_record *btf_record_dup(const struct btf_record *rec) { const struct btf_field *fields; struct btf_record *new_rec; int ret, size, i; if 
(IS_ERR_OR_NULL(rec)) return NULL; size = offsetof(struct btf_record, fields[rec->cnt]); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); /* Do a deep copy of the btf_record */ fields = rec->fields; new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; } break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: /* Nothing to acquire */ break; default: ret = -EFAULT; WARN_ON_ONCE(1); goto free; } new_rec->cnt++; } return new_rec; free: btf_record_free(new_rec); return ERR_PTR(ret); } bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; if (!a_has_fields && !b_has_fields) return true; if (a_has_fields != b_has_fields) return false; if (rec_a->cnt != rec_b->cnt) return false; size = offsetof(struct btf_record, fields[rec_a->cnt]); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. * * While spin_lock, timer, and kptr have no relation to map BTF, * list_head metadata is specific to map BTF, the btf and value_rec * members in particular. btf is the map BTF, while value_rec points to * btf_record in that map BTF. * * So while by default, we don't rely on the map BTF (which the records * were parsed from) matching for both records, which is not backwards * compatible, in case list_head is part of it, we implicitly rely on * that by way of depending on memcmp succeeding for it. */ return !memcmp(rec_a, rec_b, size); } void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) return; bpf_timer_cancel_and_free(obj + rec->timer_off); } extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; int i; if (IS_ERR_OR_NULL(rec)) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
pointee_struct_meta->record : NULL); migrate_enable(); } else { field->kptr.dtor(xchgd_field); } break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_RB_ROOT: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); continue; } } } /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * * Note that the btf_record stashed in map->inner_map_meta->record was * already freed using the map_free callback for map in map case which * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ btf_record_free(rec); } static void bpf_map_put_uref(struct bpf_map *map) { if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } } /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ void bpf_map_put(struct bpf_map *map) { if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. */ queue_work(system_unbound_wq, &map->work); } } EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); bpf_map_put(map); } static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; if (map->ops->map_release) map->ops->map_release(map, filp); bpf_map_put_with_uref(map); return 0; } static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { fmode_t mode = f.file->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. 
*/ if (READ_ONCE(map->frozen)) mode &= ~FMODE_CAN_WRITE; return mode; } #ifdef CONFIG_PROC_FS /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; if (map_type_contains_progs(map)) { spin_lock(&map->owner.lock); type = map->owner.type; jited = map->owner.jited; spin_unlock(&map->owner.lock); } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, (unsigned long long)map->map_extra, bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { seq_printf(m, "owner_prog_type:\t%u\n", type); seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_READ. */ return -EINVAL; } static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_WRITE. */ return -EINVAL; } /* called for any extra memory-mapped regions (except initial) */ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { .open = bpf_map_mmap_open, .close = bpf_map_mmap_close, }; static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; int err; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; mutex_lock(&map->freeze_mutex); if (vma->vm_flags & VM_WRITE) { if (map->frozen) { err = -EPERM; goto out; } /* map is meant to be read-only, so do not allow mapping as * writable, because it's possible to leak a writable page * reference and allows user-space to still modify it after * freezing, while verifier will assume contents do not change */ if (map->map_flags & BPF_F_RDONLY_PROG) { err = -EACCES; goto out; } } /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) goto out; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; } static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) { struct bpf_map *map = filp->private_data; if (map->ops->map_poll) return map->ops->map_poll(map, filp, pts); return EPOLLERR; } const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, .poll = bpf_map_poll, }; 
int bpf_map_new_fd(struct bpf_map *map, int flags) { int ret; ret = security_bpf_map(map, OPEN_FMODE(flags)); if (ret < 0) return ret; return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } int bpf_get_file_flag(int flags) { if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) return -EINVAL; if (flags & BPF_F_RDONLY) return O_RDONLY; if (flags & BPF_F_WRONLY) return O_WRONLY; return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ sizeof(attr->CMD##_LAST_FIELD), 0, \ sizeof(*attr) - \ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL /* dst and src must have at least "size" number of bytes. * Return strlen on success and < 0 on error. */ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { const char *end = src + size; const char *orig_src = src; memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return -ENOTSUPP; } static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; int ret = 0; /* Some maps allow key to be unspecified. */ if (btf_key_id) { key_type = btf_type_id_size(btf, &btf_key_id, &key_size); if (!key_type || key_size != map->key_size) return -EINVAL; } else { key_type = btf_type_by_id(btf, 0); if (!map->ops->map_check_btf) return -EINVAL; } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) return -EINVAL; map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; } if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ret = -EACCES; goto free_map_tab; } for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { switch (map->record->field_mask & (1 << i)) { case 0: continue; case BPF_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_TIMER: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto 
free_map_tab; } break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; default: /* Fail if map_type checks are missing for a field type */ ret = -EOPNOTSUPP; goto free_map_tab; } } } ret = btf_check_and_fixup_fields(btf, map->record); if (ret < 0) goto free_map_tab; if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) goto free_map_tab; } return ret; free_map_tab: bpf_map_free_record(map); return ret; } #define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || attr->btf_value_type_id) return -EINVAL; } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_extra != 0) return -EINVAL; f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; if (map_type >= ARRAY_SIZE(bpf_map_types)) return -EINVAL; map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) return -EINVAL; if (ops->map_alloc_check) { err = ops->map_alloc_check(attr); if (err) return err; } if (attr->map_ifindex) ops = &bpf_map_offload_ops; if (!ops->map_mem_usage) return -EINVAL; /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. 
*/ if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; /* check privileged map type permissions */ switch (map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: case BPF_MAP_TYPE_PERF_EVENT_ARRAY: case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: /* unprivileged */ break; case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_BLOOM_FILTER: case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: case BPF_MAP_TYPE_STACK_TRACE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: if (!bpf_capable()) return -EPERM; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: WARN(1, "unsupported map type %d", map_type); return -EPERM; } map = ops->map_alloc(attr); if (IS_ERR(map)) return PTR_ERR(map); map->ops = ops; map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with * to figure out the corresponding kernel's * counter part. Thus, attr->btf_fd has * to be valid also. */ attr->btf_vmlinux_value_type_id) { struct btf *btf; btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } if (btf_is_kernel(btf)) { btf_put(btf); err = -EACCES; goto free_map; } map->btf = btf; if (attr->btf_value_type_id) { err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; } map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; map->btf_vmlinux_value_type_id = attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); if (err) goto free_map; err = bpf_map_alloc_id(map); if (err) goto free_map_sec; bpf_map_save_memcg(map); err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); return err; } return err; free_map_sec: security_bpf_map_free(map); free_map: btf_put(map->btf); map->ops->map_free(map); return err; } /* if error is returned, fd is released. 
* On success caller should complete fd access with matching fdput() */ struct bpf_map *__bpf_map_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); if (f.file->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } return f.file->private_data; } void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); void bpf_map_inc_with_uref(struct bpf_map *map) { atomic64_inc(&map->refcnt); atomic64_inc(&map->usercnt); } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); struct bpf_map *bpf_map_get(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; map = __bpf_map_get(f); if (IS_ERR(map)) return map; bpf_map_inc(map); fdput(f); return map; } EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; map = __bpf_map_get(f); if (IS_ERR(map)) return map; bpf_map_inc_with_uref(map); fdput(f); return map; } /* map_idr_lock should have been held or the map should have been * protected by rcu read lock. */ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); if (uref) atomic64_inc(&map->usercnt); return map; } struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; } static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); return NULL; } static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); return NULL; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; if (attr->flags & ~BPF_F_LOCK) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { if (copy_from_user(value, uvalue, value_size)) err = -EFAULT; else err = bpf_map_copy_value(map, key, value, attr->flags); goto free_value; } err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; free_value: kvfree(value); free_key: kvfree(key); err_put: fdput(f); return err; } #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); bpfptr_t uvalue = make_bpfptr(attr->value, 
uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); goto free_key; } err = bpf_map_update_value(map, f.file, key, value, attr->flags); kvfree(value); free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); fdput(f); return err; } #define BPF_MAP_DELETE_ELEM_LAST_FIELD key static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: kvfree(key); err_put: bpf_map_write_active_dec(map); fdput(f); return err; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key static int map_get_next_key(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } if (ukey) { key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } } else { key = NULL; } err = -ENOMEM; next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); out: if (err) goto free_next_key; err = -EFAULT; if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; err = 0; free_next_key: kvfree(next_key); free_key: kvfree(key); err_put: fdput(f); return err; } int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 cp, max_count; int err = 0; void *key; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && 
!btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } max_count = attr->batch.count; if (!max_count) return 0; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size)) break; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(key); maybe_wait_bpf_programs(map); return err; } int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; void *key, *value; int err = 0; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } value_size = bpf_map_value_size(map); max_count = attr->batch.count; if (!max_count) return 0; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kvfree(key); return -ENOMEM; } for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size) || copy_from_user(value, values + cp * value_size, value_size)) break; err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(value); kvfree(key); return err; } #define MAP_LOOKUP_RETRIES 3 int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kvfree(buf_prevkey); return -ENOMEM; } err = -EFAULT; prev_key = NULL; if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) goto free_buf; key = buf; value = key + map->key_size; if (ubatch) prev_key = buf_prevkey; for (cp = 0; cp < max_count;) { rcu_read_lock(); err = map->ops->map_get_next_key(map, prev_key, key); rcu_read_unlock(); if (err) break; err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); if (err == -ENOENT) { if (retry) { retry--; continue; } err = -EINTR; break; } if (err) goto free_buf; if (copy_to_user(keys + cp * map->key_size, key, map->key_size)) { err = -EFAULT; goto free_buf; } if (copy_to_user(values + 
cp * value_size, value, value_size)) { err = -EFAULT; goto free_buf; } if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; cond_resched(); } if (err == -EFAULT) goto free_buf; if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || (cp && copy_to_user(uobatch, prev_key, map->key_size)))) err = -EFAULT; free_buf: kvfree(buf_prevkey); kvfree(buf); return err; } #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags static int map_lookup_and_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) return -EINVAL; if (attr->flags & ~BPF_F_LOCK) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if (attr->flags && (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK)) { err = -EINVAL; goto err_put; } if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -ENOTSUPP; if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_pop_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); rcu_read_unlock(); bpf_enable_instrumentation(); } } if (err) goto free_value; if (copy_to_user(uvalue, value, value_size) != 0) { err = -EFAULT; goto free_value; } err = 0; free_value: kvfree(value); free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); fdput(f); return err; } #define BPF_MAP_FREEZE_LAST_FIELD map_fd static int map_freeze(const union bpf_attr *attr) { int err = 0, ufd = attr->map_fd; struct bpf_map *map; struct fd f; if (CHECK_ATTR(BPF_MAP_FREEZE)) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { fdput(f); return -EPERM; } mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; } WRITE_ONCE(map->frozen, true); err_put: mutex_unlock(&map->freeze_mutex); fdput(f); return err; } static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { const struct bpf_prog_ops *ops; if (type >= ARRAY_SIZE(bpf_prog_types)) return -EINVAL; type = 
array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ops = bpf_prog_types[type]; if (!ops) return -EINVAL; if (!bpf_prog_is_offloaded(prog->aux)) prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } enum bpf_audit { BPF_AUDIT_LOAD, BPF_AUDIT_UNLOAD, BPF_AUDIT_MAX, }; static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { [BPF_AUDIT_LOAD] = "LOAD", [BPF_AUDIT_UNLOAD] = "UNLOAD", }; static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) { struct audit_context *ctx = NULL; struct audit_buffer *ab; if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) return; if (audit_enabled == AUDIT_OFF) return; if (!in_irq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) return; audit_log_format(ab, "prog-id=%u op=%s", prog->aux->id, bpf_audit_str[op]); audit_log_end(ab); } static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_prog_free_id(struct bpf_prog *prog) { unsigned long flags; /* cBPF to eBPF migrations are currently not in the idr store. * Offloaded programs are removed from the store when their device * disappears - even if someone grabs an fd to them they are unusable, * simply waiting for refcnt to drop to be freed. */ if (!prog->aux->id) return; spin_lock_irqsave(&prog_idr_lock, flags); idr_remove(&prog_idr, prog->aux->id); prog->aux->id = 0; spin_unlock_irqrestore(&prog_idr_lock, flags); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); if (deferred) { if (prog->aux->sleepable) call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); else call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } else { __bpf_prog_put_rcu(&prog->aux->rcu); } } static void bpf_prog_put_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; struct bpf_prog *prog; aux = container_of(work, struct bpf_prog_aux, work); prog = aux->prog; perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); bpf_prog_free_id(prog); __bpf_prog_put_noref(prog, true); } static void __bpf_prog_put(struct bpf_prog *prog) { struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { if (in_irq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { bpf_prog_put_deferred(&aux->work); } } } void bpf_prog_put(struct bpf_prog *prog) { __bpf_prog_put(prog); } EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; bpf_prog_put(prog); return 0; } struct bpf_prog_kstats { u64 nsecs; u64 cnt; u64 misses; }; void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; unsigned int 
flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); u64_stats_update_end_irqrestore(&stats->syncp, flags); } static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; u64 tnsecs, tcnt, tmisses; st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin(&st->syncp); tnsecs = u64_stats_read(&st->nsecs); tcnt = u64_stats_read(&st->cnt); tmisses = u64_stats_read(&st->misses); } while (u64_stats_fetch_retry(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; stats->misses = misses; } #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; struct bpf_prog_kstats stats; bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" "run_cnt:\t%llu\n" "recursion_misses:\t%llu\n" "verified_insns:\t%u\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, stats.cnt, stats.misses, prog->aux->verified_insns); } #endif const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) { int ret; ret = security_bpf_prog(prog); if (ret < 0) return ret; return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); if (f.file->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } return f.file->private_data; } void bpf_prog_add(struct bpf_prog *prog, int i) { atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); void bpf_prog_sub(struct bpf_prog *prog, int i) { /* Only to be used for undoing previous bpf_prog_add() in some * error path. We still know that another entity in our call * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! 
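 *
 * A rough sketch of the intended pairing (hand_refs_to_consumers() is a
 * hypothetical helper used only for illustration, not a kernel API):
 *
 *	bpf_prog_add(prog, n);
 *	err = hand_refs_to_consumers(prog, n);
 *	if (err)
 *		bpf_prog_sub(prog, n);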
*/ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); void bpf_prog_inc(struct bpf_prog *prog) { atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); /* prog_idr_lock should have been held */ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); return prog; } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ if (!attach_type) return true; if (prog->type != *attach_type) return false; if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, bool attach_drv) { struct fd f = fdget(ufd); struct bpf_prog *prog; prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { prog = ERR_PTR(-EINVAL); goto out; } bpf_prog_inc(prog); out: fdput(f); return prog; } struct bpf_prog *bpf_prog_get(u32 ufd) { return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* Initially all BPF programs could be loaded w/o specifying * expected_attach_type. Later for some of them specifying expected_attach_type * at load time became required so that program could be validated properly. * Programs of types that are allowed to be loaded both w/ and w/o (for * backward compatibility) expected_attach_type, should have the default attach * type assigned to expected_attach_type for the latter case, so that it can be * validated later at attach time. * * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if * prog type requires it but has some attach types that have to be backward * compatible. */ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) { switch (attr->prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't * exist so checking for non-zero is the way to go here. 
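 * A zero expected_attach_type from an older loader is therefore treated as
 * "unspecified" and defaulted to BPF_CGROUP_INET_SOCK_CREATE below, so the
 * attach type can still be validated at attach time.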
*/ if (!attr->expected_attach_type) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; case BPF_PROG_TYPE_SK_REUSEPORT: if (!attr->expected_attach_type) attr->expected_attach_type = BPF_SK_REUSEPORT_SELECT; break; } } static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; if (!attach_btf && !dst_prog) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SKB: switch (expected_attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCKOPT: switch (expected_attach_type) { case BPF_CGROUP_SETSOCKOPT: case BPF_CGROUP_GETSOCKOPT: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_SK_LOOKUP: if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; case BPF_PROG_TYPE_SK_REUSEPORT: switch (expected_attach_type) { case BPF_SK_REUSEPORT_SELECT: case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_NETFILTER: if (expected_attach_type == BPF_NETFILTER) return 0; return -EINVAL; case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; fallthrough; default: return 0; } } static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ case BPF_PROG_TYPE_SK_REUSEPORT: /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ default: return false; } } static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ return true; default: return false; } } /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_true_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; int err; char license[128]; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && !bpf_capable()) return -EPERM; /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out for these * and other operations. */ if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_capable()) return -EPERM; if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ if (attr->attach_prog_fd) { dst_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(dst_prog)) { dst_prog = NULL; attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); if (IS_ERR(attach_btf)) return -EINVAL; if (!btf_is_kernel(attach_btf)) { /* attaching through specifying bpf_prog's BTF * objects directly might be supported eventually */ btf_put(attach_btf); return -ENOTSUPP; } } } else if (attr->attach_btf_id) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) return PTR_ERR(attach_btf); if (!attach_btf) return -EINVAL; btf_get(attach_btf); } bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, dst_prog)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); return -EINVAL; } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); return -ENOMEM; } prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = 
security_bpf_prog_alloc(prog->aux); if (err) goto free_prog; prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) goto free_prog_sec; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) goto free_prog_sec; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; /* Upon success of bpf_prog_alloc_id(), the BPF prog is * effectively publicly exposed. However, retrieving via * bpf_prog_get_fd_by_id() will take another reference, * therefore it cannot be gone underneath us. * * Only for the time /after/ successful bpf_prog_new_fd() * and before returning to userspace, we might just hold * one reference and any parallel close on that fd could * rip everything out. Hence, below notifications must * happen before bpf_prog_new_fd(). * * Also, any failure handling from this point onwards must * be using bpf_prog_put() given the program is exposed. */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) bpf_prog_put(prog); return err; free_used_maps: /* In case we have subprogs, we need to wait for a grace * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ __bpf_prog_put_noref(prog, prog->aux->func_cnt); return err; free_prog_sec: free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); free_prog: if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); return err; } #define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; return bpf_obj_pin_user(attr->bpf_fd, path_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? 
attr->path_fd : AT_FDCWD; return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); link->type = type; link->id = 0; link->ops = ops; link->prog = prog; } static void bpf_link_free_id(int id) { if (!id) return; spin_lock_bh(&link_idr_lock); idr_remove(&link_idr, id); spin_unlock_bh(&link_idr_lock); } /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. * This helper eventually calls link's dealloc callback, but does not call * link's release callback. */ void bpf_link_cleanup(struct bpf_link_primer *primer) { primer->link->prog = NULL; bpf_link_free_id(primer->id); fput(primer->file); put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) { atomic64_inc(&link->refcnt); } /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } /* free bpf_link and its containing memory */ link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) { struct bpf_link *link = container_of(work, struct bpf_link, work); bpf_link_free(link); } /* bpf_link_put might be called from atomic context. It needs to be called * from sleepable context in order to acquire sleeping locks during the process. 
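 * Freeing is therefore deferred to the system workqueue here, while
 * bpf_link_put_direct() below frees immediately, since its callers are
 * known to run in sleepable (process) context.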
 */
void bpf_link_put(struct bpf_link *link)
{
	if (!atomic64_dec_and_test(&link->refcnt))
		return;

	INIT_WORK(&link->work, bpf_link_put_deferred);
	schedule_work(&link->work);
}
EXPORT_SYMBOL(bpf_link_put);

static void bpf_link_put_direct(struct bpf_link *link)
{
	if (!atomic64_dec_and_test(&link->refcnt))
		return;
	bpf_link_free(link);
}

static int bpf_link_release(struct inode *inode, struct file *filp)
{
	struct bpf_link *link = filp->private_data;

	bpf_link_put_direct(link);
	return 0;
}

#ifdef CONFIG_PROC_FS
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
static const char *bpf_link_type_strs[] = {
	[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_link *link = filp->private_data;
	const struct bpf_prog *prog = link->prog;
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

	seq_printf(m,
		   "link_type:\t%s\n"
		   "link_id:\t%u\n",
		   bpf_link_type_strs[link->type],
		   link->id);
	if (prog) {
		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
		seq_printf(m,
			   "prog_tag:\t%s\n"
			   "prog_id:\t%u\n",
			   prog_tag,
			   prog->aux->id);
	}
	if (link->ops->show_fdinfo)
		link->ops->show_fdinfo(link, m);
}
#endif

static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = bpf_link_show_fdinfo,
#endif
	.release = bpf_link_release,
	.read = bpf_dummy_read,
	.write = bpf_dummy_write,
};

static int bpf_link_alloc_id(struct bpf_link *link)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&link_idr_lock);
	id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
	spin_unlock_bh(&link_idr_lock);
	idr_preload_end();

	return id;
}

/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
 * reserving unused FD and allocating ID from link_idr. This is to be paired
 * with bpf_link_settle() to install FD and ID and expose bpf_link to
 * user-space, if bpf_link is successfully attached. If not, bpf_link and
 * pre-allocated resources are to be freed with a bpf_link_cleanup() call. All
 * the transient state is passed around in struct bpf_link_primer.
 * This is the preferred way to create and initialize bpf_link, especially when
 * there are complicated and expensive operations in between creating bpf_link
 * itself and attaching it to a BPF hook. By using bpf_link_prime() and
 * bpf_link_settle(), kernel code using bpf_link doesn't have to perform
 * expensive (and potentially failing) roll back operations in the rare case
 * that the file, FD, or ID can't be allocated.
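 *
 * A rough sketch of the pattern as used by the attach paths below
 * (my_attach_hook() stands in for the hook-specific attach step and is not
 * a kernel API; 'link' is the caller's containing object):
 *
 *	err = bpf_link_prime(&link->link, &primer);
 *	if (err) {
 *		kfree(link);
 *		return err;
 *	}
 *	err = my_attach_hook(&link->link);
 *	if (err) {
 *		bpf_link_cleanup(&primer);
 *		return err;
 *	}
 *	return bpf_link_settle(&primer);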
*/ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; id = bpf_link_alloc_id(link); if (id < 0) { put_unused_fd(fd); return id; } file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { bpf_link_free_id(id); put_unused_fd(fd); return PTR_ERR(file); } primer->link = link; primer->file = file; primer->fd = fd; primer->id = id; return 0; } int bpf_link_settle(struct bpf_link_primer *primer) { /* make bpf_link fetchable by ID */ spin_lock_bh(&link_idr_lock); primer->link->id = primer->id; spin_unlock_bh(&link_idr_lock); /* make bpf_link fetchable by FD */ fd_install(primer->fd, primer->file); /* pass through installed FD */ return primer->fd; } int bpf_link_new_fd(struct bpf_link *link) { return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) { struct fd f = fdget(ufd); struct bpf_link *link; if (!f.file) return ERR_PTR(-EBADF); if (f.file->f_op != &bpf_link_fops) { fdput(f); return ERR_PTR(-EINVAL); } link = f.file->private_data; bpf_link_inc(link); fdput(f); return link; } EXPORT_SYMBOL(bpf_link_get_from_fd); static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); /* tgt_prog is NULL if target is a kernel function */ if (tr_link->tgt_prog) bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, &target_obj_id, &target_btf_id); seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" "target_btf_id:\t%u\n", tr_link->attach_type, target_obj_id, target_btf_id); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); return 0; } static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, .show_fdinfo = bpf_tracing_link_show_fdinfo, .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; u64 key = 0; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; } break; case BPF_PROG_TYPE_EXT: if (prog->expected_attach_type != 0) { err = -EINVAL; goto out_put_prog; } break; case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type != BPF_LSM_MAC) { err = -EINVAL; goto 
out_put_prog; } break; default: err = -EINVAL; goto out_put_prog; } if (!!tgt_prog_fd != !!btf_id) { err = -EINVAL; goto out_put_prog; } if (tgt_prog_fd) { /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } tgt_prog = bpf_prog_get(tgt_prog_fd); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); tgt_prog = NULL; goto out_put_prog; } key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); /* There are a few possible cases here: * * - if prog->aux->dst_trampoline is set, the program was just loaded * and not yet attached to anything, so we can use the values stored * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. * * - if tgt_prog == NULL when this function was called using the old * raw_tracepoint_open API, and we need a target from prog->aux * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* * Allow re-attach for TRACING and LSM programs. If it's * currently linked, bpf_trampoline_link_prog will fail. * EXT programs need to specify tgt_prog_fd, so they * re-attach in separate code path. */ if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM) { err = -EINVAL; goto out_unlock; } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } if (!prog->aux->dst_trampoline || (key && key != prog->aux->dst_trampoline->key)) { /* If there is no saved target, or the specified target is * different from the destination specified at load time, we * need a new trampoline and a check for compatibility */ struct bpf_attach_target_info tgt_info = {}; err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, &tgt_info); if (err) goto out_unlock; if (tgt_info.tgt_mod) { module_put(prog->aux->mod); prog->aux->mod = tgt_info.tgt_mod; } tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; goto out_unlock; } } else { /* The caller didn't specify a target, or the target was the * same as the destination supplied during program load. This * means we can reuse the trampoline and reference from program * load time, and there is no need to allocate a new one. This * can only happen once for any program, as the saved values in * prog->aux are cleared below. */ tr = prog->aux->dst_trampoline; tgt_prog = prog->aux->dst_prog; } err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; goto out_unlock; } link->tgt_prog = tgt_prog; link->trampoline = tr; /* Always clear the trampoline and target prog from prog->aux to make * sure the original attach destination is not kept alive after a * program is (re-)attached to another target. 
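 * Clearing also means a later (re-)attach cannot silently reuse a stale
 * destination; it has to resolve its target again through
 * bpf_check_attach_target() above.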
*/ if (prog->aux->dst_prog && (tgt_prog_fd || tr != prog->aux->dst_trampoline)) /* got extra prog ref from syscall, or attaching to different prog */ bpf_prog_put(prog->aux->dst_prog); if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) /* we allocated a new trampoline, so free the old one */ bpf_trampoline_put(prog->aux->dst_trampoline); prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: if (tr && tr != prog->aux->dst_trampoline) bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: if (tgt_prog_fd && tgt_prog) bpf_prog_put(tgt_prog); return err; } struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; }; static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); } static void bpf_raw_tp_link_dealloc(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); kfree(raw_tp); } static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, "tp_name:\t%s\n", raw_tp_link->btp->tp->name); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, u32 len) { if (ulen >= len + 1) { if (copy_to_user(ubuf, buf, len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, buf, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } return 0; } static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); const char *tp_name = raw_tp_link->btp->tp->name; u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; if (!ubuf) return 0; return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); } static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #ifdef CONFIG_PERF_EVENTS struct bpf_perf_link { struct bpf_link link; struct file *perf_file; }; static void bpf_perf_link_release(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); struct perf_event *event = perf_link->perf_file->private_data; perf_event_free_bpf_prog(event); fput(perf_link->perf_file); } static void bpf_perf_link_dealloc(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); kfree(perf_link); } static int bpf_perf_link_fill_common(const struct perf_event *event, char __user *uname, u32 ulen, u64 *probe_offset, u64 *probe_addr, u32 *fd_type) { const char *buf; u32 prog_id; size_t len; int err; if (!ulen ^ !uname) return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, probe_offset, probe_addr); if (err) return err; if (!uname) return 0; if (buf) { len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; } else { char zero = '\0'; if 
(put_user(zero, uname)) return -EFAULT; } return 0; } #ifdef CONFIG_KPROBE_EVENTS static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, &type); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.offset = offset; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; return 0; } #endif #ifdef CONFIG_UPROBE_EVENTS static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, &type); if (err) return err; if (type == BPF_FD_TYPE_URETPROBE) info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; info->perf_event.uprobe.offset = offset; return 0; } #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, struct bpf_link_info *info) { #ifdef CONFIG_KPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) return bpf_perf_link_fill_kprobe(event, info); #endif #ifdef CONFIG_UPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) return bpf_perf_link_fill_uprobe(event, info); #endif return -EOPNOTSUPP; } static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u32 ulen; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL); } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, struct bpf_link_info *info) { info->perf_event.event.type = event->attr.type; info->perf_event.event.config = event->attr.config; info->perf_event.type = BPF_PERF_EVENT_EVENT; return 0; } static int bpf_perf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_perf_link *perf_link; const struct perf_event *event; perf_link = container_of(link, struct bpf_perf_link, link); event = perf_get_event(perf_link->perf_file); if (IS_ERR(event)) return PTR_ERR(event); switch (event->prog->type) { case BPF_PROG_TYPE_PERF_EVENT: return bpf_perf_link_fill_perf_event(event, info); case BPF_PROG_TYPE_TRACEPOINT: return bpf_perf_link_fill_tracepoint(event, info); case BPF_PROG_TYPE_KPROBE: return bpf_perf_link_fill_probe(event, info); default: return -EOPNOTSUPP; } } static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_perf_link *link; struct perf_event *event; struct file *perf_file; int err; if (attr->link_create.flags) return -EINVAL; perf_file = perf_event_get(attr->link_create.target_fd); if (IS_ERR(perf_file)) return PTR_ERR(perf_file); link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto 
out_put_file; } bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_file; } event = perf_file->private_data; err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; } /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out_put_file: fput(perf_file); return err; } #else static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, const char __user *user_tp_name) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; const char *tp_name; char buf[128]; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. */ return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_btp; } bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; } return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); return err; } #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); if (fd < 0) bpf_prog_put(prog); return fd; } static enum bpf_prog_type attach_type_to_prog_type(enum bpf_attach_type attach_type) { switch (attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; case BPF_CGROUP_DEVICE: return BPF_PROG_TYPE_CGROUP_DEVICE; case BPF_SK_MSG_VERDICT: return 
BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; case BPF_FLOW_DISSECTOR: return BPF_PROG_TYPE_FLOW_DISSECTOR; case BPF_CGROUP_SYSCTL: return BPF_PROG_TYPE_CGROUP_SYSCTL; case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { enum bpf_prog_type ptype; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; case BPF_PROG_TYPE_EXT: return 0; case BPF_PROG_TYPE_NETFILTER: if (attach_type != BPF_NETFILTER) return -EINVAL; return 0; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attach_type != BPF_PERF_EVENT) return -EINVAL; return 0; case BPF_PROG_TYPE_KPROBE: if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && attach_type != BPF_TCX_EGRESS) return -EINVAL; return 0; default: ptype = attach_type_to_prog_type(attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) return -EINVAL; return 0; } } #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ BPF_F_REPLACE) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ BPF_F_BEFORE | \ BPF_F_AFTER | \ BPF_F_ID | \ BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; u32 mask; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; mask = bpf_mprog_supported(ptype) ? 
BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE; if (attr->attach_flags & ~mask) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { bpf_prog_put(prog); return -EINVAL; } switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: if (ptype == BPF_PROG_TYPE_LSM && prog->expected_attach_type != BPF_LSM_CGROUP) ret = -EINVAL; else ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: ret = tcx_prog_attach(attr, prog); break; default: ret = -EINVAL; } if (ret) bpf_prog_put(prog); return ret; } #define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (bpf_mprog_supported(ptype)) { if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; if (attr->attach_bpf_fd) { prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); } } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: ret = sock_map_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_detach(attr); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: ret = tcx_prog_detach(attr, prog); break; default: ret = -EINVAL; } if (prog) bpf_prog_put(prog); return ret; } #define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) return -EINVAL; switch (attr->query.attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, 
uattr); case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); default: return -EINVAL; } } #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog *prog; int ret = -ENOTSUPP; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; if ((attr->test.ctx_size_in && !attr->test.ctx_in) || (!attr->test.ctx_size_in && attr->test.ctx_in)) return -EINVAL; if ((attr->test.ctx_size_out && !attr->test.ctx_out) || (!attr->test.ctx_size_out && attr->test.ctx_out)) return -EINVAL; prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->aux->ops->test_run) ret = prog->aux->ops->test_run(prog, attr, uattr); bpf_prog_put(prog); return ret; } #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id static int bpf_obj_get_next_id(const union bpf_attr *attr, union bpf_attr __user *uattr, struct idr *idr, spinlock_t *lock) { u32 next_id = attr->start_id; int err = 0; if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; next_id++; spin_lock_bh(lock); if (!idr_get_next(idr, &next_id)) err = -ENOENT; spin_unlock_bh(lock); if (!err) err = put_user(next_id, &uattr->next_id); return err; } struct bpf_map *bpf_map_get_curr_or_next(u32 *id) { struct bpf_map *map; spin_lock_bh(&map_idr_lock); again: map = idr_get_next(&map_idr, id); if (map) { map = __bpf_map_inc_not_zero(map, false); if (IS_ERR(map)) { (*id)++; goto again; } } spin_unlock_bh(&map_idr_lock); return map; } struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) { struct bpf_prog *prog; spin_lock_bh(&prog_idr_lock); again: prog = idr_get_next(&prog_idr, id); if (prog) { prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { (*id)++; goto again; } } spin_unlock_bh(&prog_idr_lock); return prog; } #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); if (prog) prog = bpf_prog_inc_not_zero(prog); else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); return prog; } static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) { struct bpf_prog *prog; u32 id = attr->prog_id; int fd; if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); fd = bpf_prog_new_fd(prog); if (fd < 0) bpf_prog_put(prog); return fd; } #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; int f_flags; int fd; if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; f_flags = bpf_get_file_flag(attr->open_flags); if (f_flags < 0) return f_flags; spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); if (IS_ERR(map)) return PTR_ERR(map); fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put_with_uref(map); return fd; } static const struct bpf_map *bpf_map_from_imm(const struct 
bpf_prog *prog, unsigned long addr, u32 *off, u32 *type) { const struct bpf_map *map; int i; mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; goto out; } } map = NULL; out: mutex_unlock(&prog->aux->used_maps_mutex); return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; u32 off, type; u64 imm; u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), GFP_USER); if (!insns) return insns; for (i = 0; i < prog->len; i++) { code = insns[i].code; if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; continue; } if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { insns[i].src_reg = type; insns[i].imm = map->id; insns[i + 1].imm = off; continue; } } return insns; } static int set_info_rec_size(struct bpf_prog_info *info) { /* * Ensure info.*_rec_size is the same as kernel expected size * * or * * Only allow zero *_rec_size if both _rec_size and _cnt are * zero. In this case, the kernel will set the expected * _rec_size back to the info. 
*/ if ((info->nr_func_info || info->func_info_rec_size) && info->func_info_rec_size != sizeof(struct bpf_func_info)) return -EINVAL; if ((info->nr_line_info || info->line_info_rec_size) && info->line_info_rec_size != sizeof(struct bpf_line_info)) return -EINVAL; if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && info->jited_line_info_rec_size != sizeof(__u64)) return -EINVAL; info->func_info_rec_size = sizeof(struct bpf_func_info); info->line_info_rec_size = sizeof(struct bpf_line_info); info->jited_line_info_rec_size = sizeof(__u64); return 0; } static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; u32 ulen; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = prog->type; info.id = prog->aux->id; info.load_time = prog->aux->load_time; info.created_by_uid = from_kuid_munged(current_user_ns(), prog->aux->user->uid); info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, &user_map_ids[i])) { mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; } } mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) return err; bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; info.nr_func_info = 0; info.nr_line_info = 0; info.nr_jited_line_info = 0; goto done; } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { struct bpf_insn *insns_sanitized; bool fault; if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); fault = copy_to_user(uinsns, insns_sanitized, ulen); kfree(insns_sanitized); if (fault) return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; goto done; } /* NOTE: the following code is supposed to be skipped for offload. * bpf_prog_offload_info_fill() is the place to fill similar fields * for offload. 
*/ ulen = info.jited_prog_len; if (prog->aux->func_cnt) { u32 i; info.jited_prog_len = 0; for (i = 0; i < prog->aux->func_cnt; i++) info.jited_prog_len += prog->aux->func[i]->jited_len; } else { info.jited_prog_len = prog->jited_len; } if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); /* for multi-function programs, copy the JITed * instructions for all the functions */ if (prog->aux->func_cnt) { u32 len, free, i; u8 *img; free = ulen; for (i = 0; i < prog->aux->func_cnt; i++) { len = prog->aux->func[i]->jited_len; len = min_t(u32, len, free); img = (u8 *) prog->aux->func[i]->bpf_func; if (copy_to_user(uinsns, img, len)) return -EFAULT; uinsns += len; free -= len; if (!free) break; } } else { if (copy_to_user(uinsns, prog->bpf_func, ulen)) return -EFAULT; } } else { info.jited_prog_insns = 0; } } ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; /* copy the address of the kernel symbol * corresponding to each function */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { ksym_addr = (unsigned long) prog->aux->func[i]->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } } else { ksym_addr = (unsigned long) prog->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { info.jited_ksyms = 0; } } ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { func_len = prog->aux->func[i]->jited_len; if (put_user(func_len, &user_lens[i])) return -EFAULT; } } else { func_len = prog->jited_len; if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { info.jited_func_lens = 0; } } if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; if (info.nr_func_info && ulen) { char __user *user_finfo; user_finfo = u64_to_user_ptr(info.func_info); ulen = min_t(u32, info.nr_func_info, ulen); if (copy_to_user(user_finfo, prog->aux->func_info, info.func_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_line_info; info.nr_line_info = prog->aux->nr_linfo; if (info.nr_line_info && ulen) { __u8 __user *user_linfo; user_linfo = u64_to_user_ptr(info.line_info); ulen = min_t(u32, info.nr_line_info, ulen); if (copy_to_user(user_linfo, prog->aux->linfo, info.line_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_jited_line_info; if (prog->aux->jited_linfo) info.nr_jited_line_info = prog->aux->nr_linfo; else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { line_addr = (unsigned long)prog->aux->jited_linfo[i]; if (put_user((__u64)line_addr, &user_linfo[i])) return 
-EFAULT; } } else { info.jited_line_info = 0; } } ulen = info.nr_prog_tags; info.nr_prog_tags = prog->aux->func_cnt ? : 1; if (ulen) { __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; u32 i; user_prog_tags = u64_to_user_ptr(info.prog_tags); ulen = min_t(u32, info.nr_prog_tags, ulen); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { if (copy_to_user(user_prog_tags[i], prog->aux->func[i]->tag, BPF_TAG_SIZE)) return -EFAULT; } } else { if (copy_to_user(user_prog_tags[0], prog->tag, BPF_TAG_SIZE)) return -EFAULT; } } done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_map_get_info_by_fd(struct file *file, struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; info.map_extra = map->map_extra; memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_btf_get_info_by_fd(struct file *file, struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; return btf_get_info_by_fd(btf, attr, uattr); } static int bpf_link_get_info_by_fd(struct file *file, struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_link_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = link->type; info.id = link->id; if (link->prog) info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); if (err) return err; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { int ufd = attr->info.bpf_fd; struct fd f; int err; if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; f = fdget(ufd); if (!f.file) return -EBADFD; if (f.file->f_op == &bpf_prog_fops) err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == 
&btf_fops) err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; fdput(f); return err; } #define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; if (!bpf_capable()) return -EPERM; return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btf_get_fd_by_id(attr->btf_id); } static int bpf_task_fd_query_copy(const union bpf_attr *attr, union bpf_attr __user *uattr, u32 prog_id, u32 fd_type, const char *buf, u64 probe_offset, u64 probe_addr) { char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); u32 len = buf ? strlen(buf) : 0, input_len; int err = 0; if (put_user(len, &uattr->task_fd_query.buf_len)) return -EFAULT; input_len = attr->task_fd_query.buf_len; if (input_len && ubuf) { if (!len) { /* nothing to copy, just make ubuf NULL terminated */ char zero = '\0'; if (put_user(zero, ubuf)) return -EFAULT; } else if (input_len >= len + 1) { /* ubuf can hold the string with NULL terminator */ if (copy_to_user(ubuf, buf, len + 1)) return -EFAULT; } else { /* ubuf cannot hold the string with NULL terminator, * do a partial copy with NULL terminator. */ char zero = '\0'; err = -ENOSPC; if (copy_to_user(ubuf, buf, input_len - 1)) return -EFAULT; if (put_user(zero, ubuf + input_len - 1)) return -EFAULT; } } if (put_user(prog_id, &uattr->task_fd_query.prog_id) || put_user(fd_type, &uattr->task_fd_query.fd_type) || put_user(probe_offset, &uattr->task_fd_query.probe_offset) || put_user(probe_addr, &uattr->task_fd_query.probe_addr)) return -EFAULT; return err; } #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr static int bpf_task_fd_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { pid_t pid = attr->task_fd_query.pid; u32 fd = attr->task_fd_query.fd; const struct perf_event *event; struct task_struct *task; struct file *file; int err; if (CHECK_ATTR(BPF_TASK_FD_QUERY)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (attr->task_fd_query.flags != 0) return -EINVAL; rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); if (!task) return -ENOENT; err = 0; file = fget_task(task, fd); put_task_struct(task); if (!file) return -EBADF; if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; err = bpf_task_fd_query_copy(attr, uattr, raw_tp->link.prog->aux->id, BPF_FD_TYPE_RAW_TRACEPOINT, btp->tp->name, 0, 0); goto put_file; } goto out_not_supp; } event = perf_get_event(file); if (!IS_ERR(event)) { u64 probe_offset, probe_addr; u32 prog_id, fd_type; const char *buf; err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, &probe_addr); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, probe_offset, probe_addr); goto put_file; } out_not_supp: err = -ENOTSUPP; put_file: fput(file); return err; } #define BPF_MAP_BATCH_LAST_FIELD batch.flags #define BPF_DO_BATCH(fn, ...) 
\ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; if (CHECK_ATTR(BPF_MAP_BATCH)) return -EINVAL; ufd = attr->batch.map_fd; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (has_write) bpf_map_write_active_inc(map); if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) bpf_map_write_active_dec(map); fdput(f); return err; } #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; if (attr->link_create.attach_type == BPF_STRUCT_OPS) return bpf_struct_ops_link_create(attr); prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) goto out; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: if (attr->link_create.attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: ret = tcx_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; #endif case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: ret = bpf_perf_link_attach(attr, prog); break; case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = 
bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; } out: if (ret < 0) bpf_prog_put(prog); return ret; } static int link_update_map(struct bpf_link *link, union bpf_attr *attr) { struct bpf_map *new_map, *old_map = NULL; int ret; new_map = bpf_map_get(attr->link_update.new_map_fd); if (IS_ERR(new_map)) return PTR_ERR(new_map); if (attr->link_update.flags & BPF_F_REPLACE) { old_map = bpf_map_get(attr->link_update.old_map_fd); if (IS_ERR(old_map)) { ret = PTR_ERR(old_map); goto out_put; } } else if (attr->link_update.old_map_fd) { ret = -EINVAL; goto out_put; } ret = link->ops->update_map(link, new_map, old_map); if (old_map) bpf_map_put(old_map); out_put: bpf_map_put(new_map); return ret; } #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) { struct bpf_prog *old_prog = NULL, *new_prog; struct bpf_link *link; u32 flags; int ret; if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; flags = attr->link_update.flags; if (flags & ~BPF_F_REPLACE) return -EINVAL; link = bpf_link_get_from_fd(attr->link_update.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->update_map) { ret = link_update_map(link, attr); goto out_put_link; } new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); goto out_put_link; } if (flags & BPF_F_REPLACE) { old_prog = bpf_prog_get(attr->link_update.old_prog_fd); if (IS_ERR(old_prog)) { ret = PTR_ERR(old_prog); old_prog = NULL; goto out_put_progs; } } else if (attr->link_update.old_prog_fd) { ret = -EINVAL; goto out_put_progs; } if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else ret = -EINVAL; out_put_progs: if (old_prog) bpf_prog_put(old_prog); if (ret) bpf_prog_put(new_prog); out_put_link: bpf_link_put_direct(link); return ret; } #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd static int link_detach(union bpf_attr *attr) { struct bpf_link *link; int ret; if (CHECK_ATTR(BPF_LINK_DETACH)) return -EINVAL; link = bpf_link_get_from_fd(attr->link_detach.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->detach) ret = link->ops->detach(link); else ret = -EOPNOTSUPP; bpf_link_put_direct(link); return ret; } static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 
link : ERR_PTR(-ENOENT); } struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ link = idr_find(&link_idr, id); if (link) { if (link->id) link = bpf_link_inc_not_zero(link); else link = ERR_PTR(-EAGAIN); } else { link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); return link; } struct bpf_link *bpf_link_get_curr_or_next(u32 *id) { struct bpf_link *link; spin_lock_bh(&link_idr_lock); again: link = idr_get_next(&link_idr, id); if (link) { link = bpf_link_inc_not_zero(link); if (IS_ERR(link)) { (*id)++; goto again; } } spin_unlock_bh(&link_idr_lock); return link; } #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) { struct bpf_link *link; u32 id = attr->link_id; int fd; if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; link = bpf_link_by_id(id); if (IS_ERR(link)) return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) bpf_link_put_direct(link); return fd; } DEFINE_MUTEX(bpf_stats_enabled_mutex); static int bpf_stats_release(struct inode *inode, struct file *file) { mutex_lock(&bpf_stats_enabled_mutex); static_key_slow_dec(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return 0; } static const struct file_operations bpf_stats_fops = { .release = bpf_stats_release, }; static int bpf_enable_runtime_stats(void) { int fd; mutex_lock(&bpf_stats_enabled_mutex); /* Set a very high limit to avoid overflow */ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { mutex_unlock(&bpf_stats_enabled_mutex); return -EBUSY; } fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); if (fd >= 0) static_key_slow_inc(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return fd; } #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type static int bpf_enable_stats(union bpf_attr *attr) { if (CHECK_ATTR(BPF_ENABLE_STATS)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (attr->enable_stats.type) { case BPF_STATS_RUN_TIME: return bpf_enable_runtime_stats(); default: break; } return -EINVAL; } #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags static int bpf_iter_create(union bpf_attr *attr) { struct bpf_link *link; int err; if (CHECK_ATTR(BPF_ITER_CREATE)) return -EINVAL; if (attr->iter_create.flags) return -EINVAL; link = bpf_link_get_from_fd(attr->iter_create.link_fd); if (IS_ERR(link)) return PTR_ERR(link); err = bpf_iter_new_fd(link); bpf_link_put_direct(link); return err; } #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags static int bpf_prog_bind_map(union bpf_attr *attr) { struct bpf_prog *prog; struct bpf_map *map; struct bpf_map **used_maps_old, **used_maps_new; int i, ret = 0; if (CHECK_ATTR(BPF_PROG_BIND_MAP)) return -EINVAL; if (attr->prog_bind_map.flags) return -EINVAL; prog = bpf_prog_get(attr->prog_bind_map.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); map = bpf_map_get(attr->prog_bind_map.map_fd); if (IS_ERR(map)) { ret = PTR_ERR(map); goto out_prog_put; } mutex_lock(&prog->aux->used_maps_mutex); used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) if (used_maps_old[i] == map) { bpf_map_put(map); goto out_unlock; } used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, sizeof(used_maps_new[0]), GFP_KERNEL); if (!used_maps_new) { ret = -ENOMEM; goto out_unlock; } memcpy(used_maps_new, used_maps_old, 
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; prog->aux->used_map_cnt++; prog->aux->used_maps = used_maps_new; kfree(used_maps_old); out_unlock: mutex_unlock(&prog->aux->used_maps_mutex); if (ret) bpf_map_put(map); out_prog_put: bpf_prog_put(prog); return ret; } static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ memset(&attr, 0, sizeof(attr)); if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; err = security_bpf(cmd, &attr, size); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); break; case BPF_MAP_UPDATE_ELEM: err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; case BPF_MAP_FREEZE: err = map_freeze(&attr); break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); break; case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr.user); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); break; case BPF_PROG_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &prog_idr, &prog_idr_lock); break; case BPF_MAP_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &map_idr, &map_idr_lock); break; case BPF_BTF_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &btf_idr, &btf_idr_lock); break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr.user); break; case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr.user); break; case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); break; case BPF_MAP_LOOKUP_AND_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_AND_DELETE_BATCH); break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); break; case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); break; case BPF_LINK_CREATE: err = link_create(&attr, uattr); break; case BPF_LINK_UPDATE: err = link_update(&attr); break; case BPF_LINK_GET_FD_BY_ID: err = bpf_link_get_fd_by_id(&attr); break; case BPF_LINK_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &link_idr, &link_idr_lock); break; case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; case BPF_LINK_DETACH: err = link_detach(&attr); break; case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; default: err = -EINVAL; break; } return err; } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, 
unsigned int, size) { return __sys_bpf(cmd, USER_BPFPTR(uattr), size); } static bool syscall_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= U16_MAX) return false; if (off % size != 0) return false; return true; } BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } /* To shut up -Wmissing-prototypes. * This function is used by the kernel light skeleton * to load bpf programs when modules are loaded or during kernel boot. * See tools/lib/bpf/skel_internal.h */ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) { struct bpf_prog * __maybe_unused prog; struct bpf_tramp_run_ctx __maybe_unused run_ctx; switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || attr->test.ctx_out || attr->test.duration || attr->test.repeat || attr->test.flags) return -EINVAL; prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); if (IS_ERR(prog)) return PTR_ERR(prog); if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || attr->test.ctx_size_in > U16_MAX) { bpf_prog_put(prog); return -EINVAL; } run_ctx.bpf_cookie = 0; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif default: return ____bpf_sys_bpf(cmd, attr, size); } } EXPORT_SYMBOL(kern_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return bpf_base_func_proto(func_id); } BPF_CALL_1(bpf_sys_close, u32, fd) { /* When bpf program calls this helper there should not be * an fdget() without matching completed fdput(). * This helper is allowed in the following callchain only: * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close */ return close_fd(fd); } static const struct bpf_func_proto bpf_sys_close_proto = { .func = bpf_sys_close, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) { if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; if (!bpf_dump_raw_ok(current_cred())) return -EPERM; *res = kallsyms_lookup_name(name); return *res ? 
0 : -ENOENT; } static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; static const struct bpf_func_proto * syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: return &bpf_sys_close_proto; case BPF_FUNC_kallsyms_lookup_name: return &bpf_kallsyms_lookup_name_proto; default: return tracing_prog_func_proto(func_id, prog); } } const struct bpf_verifier_ops bpf_syscall_verifier_ops = { .get_func_proto = syscall_prog_func_proto, .is_valid_access = syscall_prog_is_valid_access, }; const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; #ifdef CONFIG_SYSCTL static int bpf_stats_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; int val, ret; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&bpf_stats_enabled_mutex); val = saved_val; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret && val != saved_val) { if (val) static_key_slow_inc(key); else static_key_slow_dec(key); saved_val = val; } mutex_unlock(&bpf_stats_enabled_mutex); return ret; } void __weak unpriv_ebpf_notify(int new_state) { } static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, unpriv_enable = *(int *)table->data; bool locked_state = unpriv_enable == 1; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &unpriv_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (locked_state && unpriv_enable != 1) return -EPERM; *(int *)table->data = unpriv_enable; } if (write) unpriv_ebpf_notify(unpriv_enable); return ret; } static struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", .data = &bpf_stats_enabled_key.key, .mode = 0644, .proc_handler = bpf_stats_handler, }, { } }; static int __init bpf_syscall_sysctl_init(void) { register_sysctl_init("kernel", bpf_syscall_table); return 0; } late_initcall(bpf_syscall_sysctl_init); #endif /* CONFIG_SYSCTL */ |
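/* The bpf_prog_attach()/bpf_prog_detach() handlers above are driven from user
 * space through the bpf(2) syscall. The sketch below is a minimal, hypothetical
 * userspace helper (not part of this file): it attaches an already-loaded
 * BPF_PROG_TYPE_CGROUP_SKB program to a cgroup with the legacy BPF_PROG_ATTACH
 * command and detaches it again. prog_fd and cgroup_fd are assumed to come from
 * elsewhere (a prior BPF_PROG_LOAD and an open(2) of the cgroup v2 directory);
 * link-based attachment would go through BPF_LINK_CREATE / link_create() instead.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_cgroup_skb(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;          /* routed to cgroup_bpf_prog_attach() */
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;  /* must stay within the allowed mask */
	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

static int detach_cgroup_skb(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;            /* identifies which program to drop */
	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
	return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
}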
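/* bpf_obj_get_next_id(), bpf_prog_get_fd_by_id() and bpf_prog_get_info_by_fd()
 * together implement the usual "walk every loaded program" loop used by tooling
 * such as bpftool. A minimal userspace sketch of that loop follows (hypothetical
 * program, not part of this file; CAP_SYS_ADMIN is required, as enforced above,
 * and error handling is reduced to the bare minimum).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	__u32 id = 0;

	for (;;) {
		struct bpf_prog_info info;
		union bpf_attr attr;
		int fd;

		/* BPF_PROG_GET_NEXT_ID fails with ENOENT once the idr is exhausted */
		memset(&attr, 0, sizeof(attr));
		attr.start_id = id;
		if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
			break;
		id = attr.next_id;

		memset(&attr, 0, sizeof(attr));
		attr.prog_id = id;
		fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
		if (fd < 0)
			continue;

		/* the kernel copies back at most info_len bytes of bpf_prog_info */
		memset(&info, 0, sizeof(info));
		memset(&attr, 0, sizeof(attr));
		attr.info.bpf_fd   = fd;
		attr.info.info_len = sizeof(info);
		attr.info.info     = (__u64)(unsigned long)&info;
		if (!syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
			printf("id %u type %u name %s\n", info.id, info.type, info.name);
		close(fd);
	}
	return 0;
}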
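/* bpf_map_do_batch() dispatches the four BPF_MAP_*_BATCH commands to per-map
 * callbacks via BPF_DO_BATCH(). A hedged userspace sketch of the lookup side:
 * lookup_batch() is a hypothetical helper; the caller supplies key/value buffers
 * sized for the map, and in_batch/out_batch carry the resume cookie between calls
 * (pass NULL as in_batch to start from the beginning).
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int lookup_batch(int map_fd, void *in_batch, void *out_batch,
			void *keys, void *values, __u32 *count)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd    = map_fd;
	attr.batch.in_batch  = (__u64)(unsigned long)in_batch;
	attr.batch.out_batch = (__u64)(unsigned long)out_batch;
	attr.batch.keys      = (__u64)(unsigned long)keys;
	attr.batch.values    = (__u64)(unsigned long)values;
	attr.batch.count     = *count;

	/* rejected with ENOTSUPP by BPF_DO_BATCH() if the map type lacks the op */
	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));

	*count = attr.batch.count;   /* kernel writes back how many entries it copied */
	return err;
}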
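/* bpf_enable_stats() hands run-time statistics out as a file descriptor: the
 * static key stays enabled as long as at least one such fd is open, and
 * bpf_stats_release() drops it again on close. A short hedged sketch of the
 * userspace side (enable_bpf_stats() is a hypothetical helper name).
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* While the returned fd stays open, run_time_ns/run_cnt in bpf_prog_info
 * are accounted for all programs. */
static int enable_bpf_stats(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.enable_stats.type = BPF_STATS_RUN_TIME;
	return syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
}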
155 155 155 1107 155 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-256, as specified in * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf * * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. */ #include <asm/unaligned.h> #include <crypto/sha256_base.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> static const u32 SHA256_K[] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; static inline u32 Ch(u32 x, u32 y, u32 z) { return z ^ (x & (y ^ z)); } static inline u32 Maj(u32 x, u32 y, u32 z) { return (x & y) | (z & (x | y)); } #define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) #define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) #define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) #define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; } #define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ u32 t1, t2; \ t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ t2 = e0(a) + Maj(a, b, c); \ d += t1; \ h = t1 + t2; \ } while (0) static void sha256_transform(u32 *state, const u8 *input, u32 *W) { u32 a, b, c, d, e, f, g, h; int i; /* load the input */ for (i = 0; i < 16; i += 8) { LOAD_OP(i + 0, W, input); LOAD_OP(i + 1, W, input); LOAD_OP(i + 2, W, input); LOAD_OP(i + 3, W, input); LOAD_OP(i + 4, W, input); LOAD_OP(i + 5, W, input); LOAD_OP(i + 6, W, input); LOAD_OP(i + 7, W, input); } /* now blend */ for (i = 16; i < 64; i += 8) { BLEND_OP(i + 0, W); BLEND_OP(i + 1, W); BLEND_OP(i + 2, W); BLEND_OP(i + 3, W); BLEND_OP(i + 4, W); BLEND_OP(i + 5, W); BLEND_OP(i + 6, W); BLEND_OP(i + 7, W); } /* load the state into our registers */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; f = state[5]; g = state[6]; h = state[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); 
SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } static void sha256_transform_blocks(struct sha256_state *sctx, const u8 *input, int blocks) { u32 W[64]; do { sha256_transform(sctx->state, input, W); input += SHA256_BLOCK_SIZE; } while (--blocks); memzero_explicit(W, sizeof(W)); } void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) { lib_sha256_base_do_update(sctx, data, len, sha256_transform_blocks); } EXPORT_SYMBOL(sha256_update); static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_size) { lib_sha256_base_do_finalize(sctx, sha256_transform_blocks); lib_sha256_base_finish(sctx, out, digest_size); } void sha256_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 32); } EXPORT_SYMBOL(sha256_final); void sha224_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 28); } EXPORT_SYMBOL(sha224_final); void sha256(const u8 *data, unsigned int len, u8 *out) { struct sha256_state sctx; sha256_init(&sctx); sha256_update(&sctx, data, len); sha256_final(&sctx, out); } EXPORT_SYMBOL(sha256); MODULE_LICENSE("GPL"); |
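/* A minimal sketch of how kernel code is expected to consume this library,
 * assuming the declarations (sha256(), sha256_init/update/final(), struct
 * sha256_state, SHA256_DIGEST_SIZE) come from <crypto/sha2.h> as in current
 * trees. sha256_demo() is a hypothetical caller; a real user would pick either
 * the one-shot or the incremental form, not both.
 */
#include <crypto/sha2.h>
#include <linux/printk.h>

static void sha256_demo(const u8 *buf, unsigned int len)
{
	u8 digest[SHA256_DIGEST_SIZE];
	struct sha256_state sctx;

	/* one-shot helper exported above */
	sha256(buf, len, digest);

	/* equivalent incremental form, useful when data arrives in pieces */
	sha256_init(&sctx);
	sha256_update(&sctx, buf, len);
	sha256_final(&sctx, digest);

	print_hex_dump_bytes("sha256: ", DUMP_PREFIX_NONE, digest, sizeof(digest));
}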
5747 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H #include <linux/bug.h> #include <linux/kasan-enabled.h> #include <linux/kernel.h> #include <linux/static_key.h> #include <linux/types.h> struct kmem_cache; struct page; struct slab; struct vm_struct; struct task_struct; #ifdef CONFIG_KASAN #include <linux/linkage.h> #include <asm/kasan.h> #endif typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) #define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include <linux/pgtable.h> /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS /* This matches KASAN_TAG_INVALID. 
*/ #define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif #ifndef PTE_HWTABLE_PTRS #define PTE_HWTABLE_PTRS 0 #endif extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); #ifndef __HAVE_ARCH_SHADOW_MAP static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } #endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline int kasan_add_zero_shadow(void *start, unsigned long size) { return 0; } static inline void kasan_remove_zero_shadow(void *start, unsigned long size) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_HW_TAGS #else /* CONFIG_KASAN_HW_TAGS */ #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) { return kasan_hw_tags_enabled(); } #ifdef CONFIG_KASAN void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { if (kasan_enabled()) __kasan_unpoison_range(addr, size); } void __kasan_poison_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) __kasan_poison_pages(page, order, init); } bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) return __kasan_unpoison_pages(page, order, init); return false; } void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) __kasan_poison_slab(slab); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_unpoison_object_data(cache, object); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object); static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_poison_object_data(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object); static __always_inline void * __must_check kasan_init_slab_obj( struct kmem_cache *cache, const void *object) { if (kasan_enabled()) return __kasan_init_slab_obj(cache, object); return (void *)object; } bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip, bool init); static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { if (kasan_enabled()) return __kasan_slab_free(s, object, _RET_IP_, init); return false; } void __kasan_kfree_large(void *ptr, unsigned long 
ip); static __always_inline void kasan_kfree_large(void *ptr) { if (kasan_enabled()) __kasan_kfree_large(ptr, _RET_IP_); } void __kasan_slab_free_mempool(void *ptr, unsigned long ip); static __always_inline void kasan_slab_free_mempool(void *ptr) { if (kasan_enabled()) __kasan_slab_free_mempool(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) return __kasan_slab_alloc(s, object, flags, init); return object; } void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc(s, object, size, flags); return (void *)object; } void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc_large(ptr, size, flags); return (void *)ptr; } void * __must_check __kasan_krealloc(const void *object, size_t new_size, gfp_t flags); static __always_inline void * __must_check kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { if (kasan_enabled()) return __kasan_krealloc(object, new_size, flags); return (void *)object; } /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. */ bool __kasan_check_byte(const void *addr, unsigned long ip); static __always_inline bool kasan_check_byte(const void *addr) { if (kasan_enabled()) return __kasan_check_byte(addr, _RET_IP_); return true; } #else /* CONFIG_KASAN */ static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { return false; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_object_data(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { return (void *)object; } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; } static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { return object; } static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { return (void *)object; } static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { return (void *)ptr; } static inline void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { return (void *)object; } static inline bool kasan_check_byte(const void *address) { return true; } #endif /* CONFIG_KASAN */ #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) void kasan_unpoison_task_stack(struct task_struct *task); #else static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #endif #ifdef CONFIG_KASAN_GENERIC struct 
kasan_cache { int alloc_meta_offset; int free_meta_offset; }; size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ /* Tag-based KASAN modes do not use per-object metadata. */ static inline size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { return 0; } /* And thus nothing prevents cache merging. */ static inline slab_flags_t kasan_never_merge(void) { return 0; } /* And no cache-related metadata initialization is required. */ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) static inline void *kasan_reset_tag(const void *addr) { return (void *)arch_kasan_reset_tag(addr); } /** * kasan_report - print a report about a bad memory access detected by KASAN * @addr: address of the bad access * @size: size of the bad access * @is_write: whether the bad access is a write or a read * @ip: instruction pointer for the accessibility check or the bad access itself */ bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline void *kasan_reset_tag(const void *addr) { return (void *)addr; } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/ #ifdef CONFIG_KASAN_HW_TAGS void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else static inline void kasan_init_sw_tags(void) { } #endif #ifdef CONFIG_KASAN_HW_TAGS void kasan_init_hw_tags_cpu(void); void __init kasan_init_hw_tags(void); #else static inline void kasan_init_hw_tags_cpu(void) { } static inline void kasan_init_hw_tags(void) { } #endif #ifdef CONFIG_KASAN_VMALLOC #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { if (kasan_enabled()) return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } void 
__kasan_poison_vmalloc(const void *start, unsigned long size); static __always_inline void kasan_poison_vmalloc(const void *start, unsigned long size) { if (kasan_enabled()) __kasan_poison_vmalloc(start, size); } #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) { } static inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { return (void *)start; } static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) /* * These functions allocate and free shadow memory for kernel modules. * They are only required when KASAN_VMALLOC is not supported, as otherwise * shadow memory is allocated by the generic vmalloc handlers. */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); #else /* CONFIG_KASAN_INLINE */ static inline void kasan_non_canonical_hook(unsigned long addr) { } #endif /* CONFIG_KASAN_INLINE */ #endif /* LINUX_KASAN_H */ |
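/* The __kasan_*()/kasan_*() pairs above all follow one pattern: a
 * kasan_enabled()-gated inline wrapper that collapses to a stub when KASAN is
 * compiled out or disabled. A hedged sketch of how an allocator hot path is
 * expected to use the slab hooks; toy_alloc/toy_free and some_raw_alloc/
 * some_raw_free are hypothetical stand-ins, not kernel APIs.
 */
#include <linux/kasan.h>
#include <linux/slab.h>

void *some_raw_alloc(struct kmem_cache *cache, gfp_t flags);
void some_raw_free(struct kmem_cache *cache, void *obj);

static void *toy_alloc(struct kmem_cache *cache, gfp_t flags)
{
	void *obj = some_raw_alloc(cache, flags);

	/* may return a retagged pointer; plain pass-through when KASAN is off */
	return kasan_slab_alloc(cache, obj, flags, false);
}

static void toy_free(struct kmem_cache *cache, void *obj)
{
	/* true: KASAN kept the object (e.g. quarantine) or flagged the free as
	 * invalid, so the backend must not reuse it yet */
	if (kasan_slab_free(cache, obj, false))
		return;
	some_raw_free(cache, obj);
}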
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM task #if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TASK_H #include <linux/tracepoint.h> TRACE_EVENT(task_newtask, TP_PROTO(struct task_struct *task, unsigned long clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) __field( unsigned long, clone_flags) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->clone_flags = clone_flags; __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); TRACE_EVENT(task_rename, TP_PROTO(struct task_struct *task, const char *comm), TP_ARGS(task, comm), TP_STRUCT__entry( __field( pid_t, pid) __array( char, oldcomm, TASK_COMM_LEN) __array( char, newcomm, TASK_COMM_LEN) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); strscpy(entry->newcomm, comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd", __entry->pid, __entry->oldcomm, __entry->newcomm, __entry->oom_score_adj) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
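/*
 * Illustrative sketch, not part of the header above: each TRACE_EVENT()
 * definition expands (via <trace/define_trace.h>) into a trace_<event>()
 * static inline that instrumentation sites call; a .c file that only fires
 * the events includes this header without defining CREATE_TRACE_POINTS.
 * The wrapper below is a hypothetical caller written for this example; in
 * the mainline kernel the fork path emits task_newtask and comm updates
 * emit task_rename, but only the trace_task_newtask()/trace_task_rename()
 * names and argument lists follow from the definitions above.
 */
#include <linux/sched.h>
#include <trace/events/task.h>

static void example_announce_task(struct task_struct *task,
				  unsigned long clone_flags,
				  const char *new_comm)
{
	/* Does nothing unless the task:task_newtask event is enabled. */
	trace_task_newtask(task, clone_flags);

	/* Likewise gated on the task:task_rename event being enabled. */
	if (new_comm)
		trace_task_rename(task, new_comm);
}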
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) { max_mapnr = limit; } #else static inline void set_max_mapnr(unsigned long limit) { } #endif extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; extern int page_cluster; extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef
CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern const int mmap_rnd_bits_max; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * Used to prevent common memory management code from establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement its own version of * mm_zero_struct_page, it should wrap the defines below in an #ifndef and * define its own version of this macro in <asm/pgtable.h>. */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or shrinks below 56. The idea is that the compiler optimizes out the * switch() statement and leaves only move/store instructions. The compiler * can also combine write statements if they are both assignments and can be * reordered; this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas; this limits the number of VMAs * per mm struct. Users can override this number via sysctl, but there is a * problem. * * When a program's coredump is generated in ELF format, a section is created * per VMA. In ELF, the number of sections is represented as an unsigned short, * so the number of sections must stay below 65535 at coredump time. Because * the kernel adds some informative sections to the program image when * generating the coredump, we need some margin. The number of extra sections * is currently 1-3 and depends on the arch; we use "5" as a safe margin here. * * ELF extended numbering allows more than 65535 sections, so the 16-bit bound * is no longer a hard limit, although some userspace tools can be surprised * by it.
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); /* Use only if VMA has no other users */ void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. 
*/ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #ifdef CONFIG_PPC # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). 
See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #else # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_IA64) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 38 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. 
*/ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. 
*/ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. */ static inline bool vma_start_read(struct vm_area_struct *vma) { /* * Check before locking. A race might cause false locked result. 
* We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) return false; /* * Overflow might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } return true; } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ up_read(&vma->vm_lock->lock); rcu_read_unlock(); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq; return (vma->vm_lock_seq == *mm_lock_seq); } /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; down_write(&vma->vm_lock->lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
*/ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { if (!rwsem_is_locked(&vma->vm_lock->lock)) vma_assert_write_locked(vma); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ if (detached) vma_assert_write_locked(vma); vma->detached = detached; } static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. 
*/ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start <= vma->vm_mm->brk && vma->vm_end >= vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. 
*/ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio->_flags_1 & 0xff; } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio->_flags_1 & 0xff; } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. 
*/ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes - it does not include PTE-mapped sub-pages; look * at folio_mapcount() or page_mapcount() or total_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; } /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } /** * page_mapcount() - Number of times this precise page is mapped. * @page: The page. * * The number of times this page is mapped. If this page is part of * a large folio, it includes the number of times this page is mapped * as part of that folio. * * The result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). * They use this field in struct page differently. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); return mapcount; } int folio_total_mapcount(struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. * @folio: The folio. * * A large folio tracks both how many times the entire folio is mapped, * and how many times each individual page in the folio is mapped. * This function calculates the total number of times the folio is * mapped. * * Return: The number of times this folio is mapped. 
*/ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; return folio_total_mapcount(folio); } static inline int total_mapcount(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; return folio_total_mapcount(page_folio(page)); } static inline bool folio_large_is_mapped(struct folio *folio) { /* * Reading _entire_mapcount below could be omitted if hugetlb * participated in incrementing nr_pages_mapped when compound mapped. */ return atomic_read(&folio->_nr_pages_mapped) > 0 || atomic_read(&folio->_entire_mapcount) >= 0; } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) >= 0; return folio_large_is_mapped(folio); } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) >= 0; return folio_large_is_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); void destroy_large_folio(struct folio *folio); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. 
for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A page may belong to an inode's memory mapping. In this case, page->mapping * is the pointer to the inode, and page->index is the file offset of the page, * in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. 
*/ #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page_refs(struct page *page, int refs); static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; return __put_devmap_managed_page_refs(page, refs); } #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return put_devmap_managed_page_refs(page, 1); } /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { folio_get(page_folio(page)); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. 
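 *
 * Purely as an illustration (hypothetical arrays, counts chosen arbitrarily),
 * both of the following calls resolve to the same release_pages() thanks to
 * the transparent union declared below:
 *
 *	struct page *pages[16];
 *	struct folio *folios[16];
 *
 *	release_pages(pages, 16);
 *	release_pages(folios, 16);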
*/ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * @nr: How many folios there are. * * Like folio_put(), but for an array of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which * need to be taken if the folios are freed. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio **folios, unsigned int nr) { release_pages(folios, nr); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ if (put_devmap_managed_page(&folio->page)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), page_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. 
*/ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { struct page *p = (struct page *)page; return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int page_cpupid_last(struct page *page) { return page->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int page_cpupid_last(struct page *page) { return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } extern int page_cpupid_xchg_last(struct page *page, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int xchg_page_access_time(struct page *page, int time) { int last_time; last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) { __set_bit(pid_bit, &vma->numab_state->access_pids[1]); } } #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* 
XXX */ } static inline int xchg_page_access_time(struct page *page, int time) { return 0; } static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = 0xff; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, 0xff); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline struct zone *folio_zone(const struct folio *folio) { return page_zone(&folio->page); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return page_pgdat(&folio->page); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio. */ static inline unsigned long folio_pfn(struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. 
* * For small folios, the return value is partially fuzzy: false is not fuzzy, because it means "definitely not pinned for DMA", but true means "probably pinned for DMA, but possibly a false positive due to having at least GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to get that many refcounts, and b) all the callers of this routine are expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because we have more tracking data available: the _pincount field is used instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } static inline bool page_maybe_dma_pinned(struct page *page) { return folio_maybe_dma_pinned(page_folio(page)); } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq. */ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return page_maybe_dma_pinned(page); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* Otherwise, non-movable zone folios can be pinned.
*/ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline long folio_nr_pages(struct folio *folio) { if (!folio_test_large(folio)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline unsigned long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ static inline size_t folio_size(struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_estimated_sharers - Estimate the number of sharers of a folio. * @folio: The folio. 
* * folio_estimated_sharers() aims to serve as a function to efficiently * estimate the number of processes sharing a folio. This is done by * looking at the precise mapcount of the first subpage in the folio, and * assuming the other subpages are the same. This may not be true for large * folios. If you want exact mapcounts for exact calculations, look at * page_mapcount() or folio_total_mapcount(). * * Return: The estimated number of processes sharing a folio. */ static inline int folio_estimated_sharers(struct folio *folio) { return page_mapcount(folio_page(folio, 0)); } #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { return 0; } #endif #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { int ret; long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) { ret = arch_make_page_accessible(folio_page(folio, i)); if (ret) break; } return ret; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } extern pgoff_t __page_file_index(struct page *page); /* * Return the pagecache index of the passed page. Regular pagecache pages * use ->index whereas swapcache pages use swp_offset(->private) */ static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) return __page_file_index(page); return page->index; } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. 
*/ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, although it won't provide the memory usage benefits. 
*/ return raw_smp_processor_id(); } #endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); static inline void zap_vma_pages(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_page(struct address_space *mapping, struct page *page); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } 
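/*
 * Illustration only (hypothetical driver code, not an API defined here):
 * given the helpers above, a driver that revokes the backing store of its
 * device can tear down every existing userspace mapping of that
 * address_space in one call. A zero @holelen means "to the end of the
 * file", so this unmaps everything, and even_cows == 1 also zaps private
 * COW'd copies:
 *
 *	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
 *
 * Later faults on those ranges then go back through the VMA's fault
 * handler, which can decide whether to repopulate or fail the access.
 */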
static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); if (got == 0) return NULL; vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However, * for now, all the callers only use one of the flags at a * time. */ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. 
*/ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool vma_needs_dirty_tracking(struct vm_area_struct *vma); int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. 
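 *
 * As an illustrative (not normative) example of how the helpers below are
 * meant to be used together: when a page is mapped into an address space,
 * callers typically pair mm_counter() with add_mm_counter(), and undo the
 * accounting with a negative delta on unmap:
 *
 *	add_mm_counter(mm, mm_counter(page), 1);
 *	...
 *	add_mm_counter(mm, mm_counter(page), -1);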
*/ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ static inline int mm_counter_file(struct page *page) { if (PageSwapBacked(page)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct page *page) { if (PageAnon(page)) return MM_ANONPAGES; return mm_counter_file(page); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if ((mm)->hiwater_rss < _rss) (mm)->hiwater_rss = _rss; } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm); #else static inline void sync_mm_rss(struct mm_struct *mm) { } #endif #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; } #endif extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pte_t *ptep; __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); return ptep; } #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if 
defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } static inline void *ptdesc_to_virt(const struct ptdesc *pt) { return page_to_virt(ptdesc_page(pt)); } static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return folio_test_reserved(ptdesc_folio(pt)); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages(gfp | __GFP_COMP, order); return page_ptdesc(page); } /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. 
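 *
 * A minimal sketch of the expected pairing with pagetable_alloc() above
 * (order 0, error handling mostly elided; purely illustrative):
 *
 *	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 *
 *	if (!ptdesc)
 *		return -ENOMEM;
 *	... use ptdesc_address(ptdesc) as page table memory ...
 *	pagetable_free(ptdesc);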
*/ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); if (!ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pte_t *pte; __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); return pte; } pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? 
\ NULL: pte_offset_kernel(pmd, address)) #if USE_SPLIT_PMD_PTLOCKS static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } static inline void pmd_ptlock_free(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif ptlock_free(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); pmd_ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void free_reserved_page(struct page *page) { ClearPageReserved(page); init_page_count(page); __free_page(page); adjust_managed_page_count(page, 1); } #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. 
* Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); static inline void show_mem(void) { __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); #ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES extern unsigned long arch_reserved_kernel_pages(void); #endif extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); #define vma_interval_tree_foreach(vma, root, start, last) \ for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root); struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct anon_vma_chain 
*anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node); #endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next); extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, unsigned long addr, int new_below); extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); /* This is an obsolete alternative to _install_special_mapping. 
*/ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 unsigned long flags; unsigned long length; unsigned long low_limit; unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ int expand_downwards(struct vm_area_struct *vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. 
Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; return vma; } static inline bool range_in_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); } #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) { return __pgprot(0); } static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } #endif void vma_set_file(struct vm_area_struct *vma, struct file *file); #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, 
unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); } #endif static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. */ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, unsigned int flags) { /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault. */ if (!(flags & FOLL_HONOR_NUMA_FAULT)) return true; /* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs. */ return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); extern bool _page_poisoning_enabled_early; DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); static inline bool page_poisoning_enabled(void) { return _page_poisoning_enabled_early; } /* * For use in fast paths after init_mem_debugging() has run, or when a * false negative result is not harmful when called too early. 
*/ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; return static_branch_unlikely(&_debug_pagealloc_enabled); } /* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } static inline bool debug_guardpage_enabled(void) { return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) { if (!debug_guardpage_enabled()) return false; return PageGuard(page); } bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { if (!debug_guardpage_enabled()) return false; return __set_page_guard(zone, page, order, migratetype); } void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype); static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { if (!debug_guardpage_enabled()) return; __clear_page_guard(zone, page, order, migratetype); } #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page 
*page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } #endif /* __HAVE_ARCH_GATE_AREA */ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); #endif void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 #else extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); #ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); #else static inline void print_vma_addr(char *prefix, unsigned long rip) { } #endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); void pmd_init(void *addr); void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || !is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * 
pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. 
*/ enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ MF_RECOVERED, /* Successfully recovered */ }; enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, MF_MSG_DIRTY_MLOCKED_LRU, MF_MSG_CLEAN_MLOCKED_LRU, MF_MSG_DIRTY_UNEVICTABLE_LRU, MF_MSG_CLEAN_UNEVICTABLE_LRU, MF_MSG_DIRTY_LRU, MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? * @vma: Pointer to the struct vm_area_struct to consider * * Whether transhuge page-table entries are considered "special" following * the definition in vm_normal_page(). * * Return: true if transhuge page-table entries should be considered special, * false otherwise. */ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) { return vma_is_dax(vma) || (vma->vm_file && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif extern int sysctl_nr_trim_pages; #ifdef CONFIG_PRINTK void mem_dump_obj(void *object); #else static inline void mem_dump_obj(void *object) {} #endif /** * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it * @seals: the seals to check * @vma: the vma to operate on * * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on * the vma flags. Return 0 if check pass, or <0 for errors. */ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) { if (seals & F_SEAL_FUTURE_WRITE) { /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * "future write" seal active. */ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) return -EPERM; /* * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as * MAP_SHARED and read-only, take care to not allow mprotect to * revert protections on such mappings. Do this only for shared * mappings. For private mappings, don't need to mask * VM_MAYWRITE as we still want them to be COW-writable. 
*/ if (vma->vm_flags & VM_SHARED) vm_flags_clear(vma, VM_MAYWRITE); } return 0; } #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); void accept_memory(phys_addr_t start, phys_addr_t end); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) { return false; } static inline void accept_memory(phys_addr_t start, phys_addr_t end) { } #endif #endif /* _LINUX_MM_H */ |
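/*
 * Illustrative usage sketch (not part of the header above), assuming
 * <linux/mm.h> and <linux/mmap_lock.h> are available: check that a user
 * range is fully covered by a single writable VMA using vma_lookup() and
 * range_in_vma(). The helper name is hypothetical.
 */
static inline bool example_range_is_writable(struct mm_struct *mm,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *vma;
	bool ret;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, start);
	ret = vma && (vma->vm_flags & VM_WRITE) &&
	      range_in_vma(vma, start, end);
	mmap_read_unlock(mm);

	return ret;
}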
303 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MINMAX_H #define _LINUX_MINMAX_H #include <linux/const.h> #include <linux/types.h> /* * min()/max()/clamp() macros must accomplish three things: * * - avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. * - perform strict type-checking (to generate warnings instead of * nasty runtime surprises). See the "unnecessary" pointer comparison * in __typecheck(). * - retain result as a constant expressions when called with only * constant expressions (to avoid tripping VLA warnings in stack * allocation usage). */ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) #define __no_side_effects(x, y) \ (__is_constexpr(x) && __is_constexpr(y)) #define __safe_cmp(x, y) \ (__typecheck(x, y) && __no_side_effects(x, y)) #define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) #define __cmp_once(x, y, unique_x, unique_y, op) ({ \ typeof(x) unique_x = (x); \ typeof(y) unique_y = (y); \ __cmp(unique_x, unique_y, op); }) #define __careful_cmp(x, y, op) \ __builtin_choose_expr(__safe_cmp(x, y), \ __cmp(x, y, op), \ __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? 
(lo) : (val))) #define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ typeof(val) unique_val = (val); \ typeof(lo) unique_lo = (lo); \ typeof(hi) unique_hi = (hi); \ __clamp(unique_val, unique_lo, unique_hi); }) #define __clamp_input_check(lo, hi) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ __is_constexpr((lo) > (hi)), (lo) > (hi), false))) #define __careful_clamp(val, lo, hi) ({ \ __clamp_input_check(lo, hi) + \ __builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \ __typecheck(hi, lo) && __is_constexpr(val) && \ __is_constexpr(lo) && __is_constexpr(hi), \ __clamp(val, lo, hi), \ __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) /** * min - return minimum of two values of the same or compatible types * @x: first value * @y: second value */ #define min(x, y) __careful_cmp(x, y, <) /** * max - return maximum of two values of the same or compatible types * @x: first value * @y: second value */ #define max(x, y) __careful_cmp(x, y, >) /** * min3 - return minimum of three values * @x: first value * @y: second value * @z: third value */ #define min3(x, y, z) min((typeof(x))min(x, y), z) /** * max3 - return maximum of three values * @x: first value * @y: second value * @z: third value */ #define max3(x, y, z) max((typeof(x))max(x, y), z) /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero * @x: value1 * @y: value2 */ #define min_not_zero(x, y) ({ \ typeof(x) __x = (x); \ typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value * @lo: lowest allowable value * @hi: highest allowable value * * This macro does strict typechecking of @lo/@hi to make sure they are of the * same type as @val. See the unnecessary pointer comparisons. */ #define clamp(val, lo, hi) __careful_clamp(val, lo, hi) /* * ..and if you can't take the strict * types, you can specify one yourself. * * Or not use min/max/clamp at all, of course. */ /** * min_t - return minimum of two values, using the specified type * @type: data type to use * @x: first value * @y: second value */ #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) /** * max_t - return maximum of two values, using the specified type * @type: data type to use * @x: first value * @y: second value */ #define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) /* * Remove a const qualifier from integer types * _Generic(foo, type-name: association, ..., default: association) performs a * comparison against the foo type (not the qualified type). * Do not use the const keyword in the type-name as it will not match the * unqualified type of foo. */ #define __unconst_integer_type_cases(type) \ unsigned type: (unsigned type)0, \ signed type: (signed type)0 #define __unconst_integer_typeof(x) typeof( \ _Generic((x), \ char: (char)0, \ __unconst_integer_type_cases(char), \ __unconst_integer_type_cases(short), \ __unconst_integer_type_cases(int), \ __unconst_integer_type_cases(long), \ __unconst_integer_type_cases(long long), \ default: (x))) /* * Do not check the array parameter using __must_be_array(). * In the following legit use-case where the "array" passed is a simple pointer, * __must_be_array() will return a failure. * --- 8< --- * int *buff * ... * min = min_array(buff, nb_items); * --- 8< --- * * The first typeof(&(array)[0]) is needed in order to support arrays of both * 'int *buff' and 'int buff[N]' types. 
* * The array can be an array of const items. * typeof() keeps the const qualifier. Use __unconst_integer_typeof() in order * to discard the const qualifier for the __element variable. */ #define __minmax_array(op, array, len) ({ \ typeof(&(array)[0]) __array = (array); \ typeof(len) __len = (len); \ __unconst_integer_typeof(__array[0]) __element = __array[--__len]; \ while (__len--) \ __element = op(__element, __array[__len]); \ __element; }) /** * min_array - return minimum of values present in an array * @array: array * @len: array length * * Note that @len must not be zero (empty array). */ #define min_array(array, len) __minmax_array(min, array, len) /** * max_array - return maximum of values present in an array * @array: array * @len: array length * * Note that @len must not be zero (empty array). */ #define max_array(array, len) __minmax_array(max, array, len) /** * clamp_t - return a value clamped to a given range using a given type * @type: the type of variable to use * @val: current value * @lo: minimum allowable value * @hi: maximum allowable value * * This macro does no typechecking and uses temporary variables of type * @type to make all the comparisons. */ #define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) /** * clamp_val - return a value clamped to a given range using val's type * @val: current value * @lo: minimum allowable value * @hi: maximum allowable value * * This macro does no typechecking and uses temporary variables of whatever * type the input argument @val is. This is useful when @val is an unsigned * type and @lo and @hi are literals that will otherwise be assigned a signed * integer type. */ #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) static inline bool in_range64(u64 val, u64 start, u64 len) { return (val - start) < len; } static inline bool in_range32(u32 val, u32 start, u32 len) { return (val - start) < len; } /** * in_range - Determine if a value lies within a range. * @val: Value to test. * @start: First value in range. * @len: Number of values in range. * * This is more efficient than "if (start <= val && val < (start + len))". * It also gives a different answer if @start + @len overflows the size of * the type by a sufficient amount to encompass @val. Decide for yourself * which behaviour you want, or prove that start + len never overflow. * Do not blindly replace one form with the other. */ #define in_range(val, start, len) \ ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ? \ in_range32(val, start, len) : in_range64(val, start, len)) /** * swap - swap values of @a and @b * @a: first value * @b: second value */ #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) #endif /* _LINUX_MINMAX_H */ |
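/*
 * Illustrative usage sketch (not part of the header above): typical uses of
 * the min/max/clamp family. The values are made up for demonstration and
 * ARRAY_SIZE() is assumed to come from <linux/kernel.h>.
 */
static inline int example_minmax_usage(void)
{
	int samples[] = { 7, 3, 12, 5 };
	size_t queued = 300, batch;
	int a = 4, b = 9;
	int lo, hi, capped, smallest;

	lo = min(a, b);					/* 4 */
	hi = max3(a, b, 11);				/* 11 */
	capped = clamp(a, 5, 8);			/* 5, raised to the lower bound */
	smallest = min_array(samples, ARRAY_SIZE(samples));	/* 3 */

	/* Mixed types would trip the strict type check, so use min_t(). */
	batch = min_t(size_t, queued, 128);		/* 128 */

	if (capped > hi)				/* not taken here */
		swap(capped, hi);

	return lo + hi + capped + smallest + (int)batch;
}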
2745 2539 2540 2540 2618 2618 2618 2744 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | // SPDX-License-Identifier: GPL-2.0 #include <linux/quotaops.h> #include <linux/uuid.h> #include "ext4.h" #include "xattr.h" #include "ext4_jbd2.h" static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, const struct fscrypt_name *src) { memset(dst, 0, sizeof(*dst)); dst->usr_fname = src->usr_fname; dst->disk_name = src->disk_name; dst->hinfo.hash = src->hash; dst->hinfo.minor_hash = src->minor_hash; dst->crypto_buf = src->crypto_buf; } int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { struct fscrypt_name name; int err; err = fscrypt_setup_filename(dir, iname, lookup, &name); if (err) return err; ext4_fname_from_fscrypt_name(fname, &name); #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); if (err) ext4_fname_free_filename(fname); #endif return err; } int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { struct fscrypt_name name; int err; err = fscrypt_prepare_lookup(dir, dentry, &name); if (err) return err; ext4_fname_from_fscrypt_name(fname, &name); #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); if (err) ext4_fname_free_filename(fname); #endif return err; } void ext4_fname_free_filename(struct ext4_filename *fname) { struct fscrypt_name name; name.crypto_buf = fname->crypto_buf; fscrypt_free_filename(&name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; #if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif } static bool uuid_is_zero(__u8 u[16]) { int i; for (i = 0; i < 16; i++) if (u[i]) return false; return true; } int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { struct super_block *sb = file_inode(filp)->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); int err, err2; handle_t *handle; if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); if (err) return err; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto pwsalt_err_exit; } err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); pwsalt_err_journal: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; 
pwsalt_err_exit: mnt_drop_write_file(filp); if (err) return err; } if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; int res, res2, credits, retries = 0; /* * Encrypting the root directory is not allowed because e2fsck expects * lost+found to exist and be unencrypted, and encrypting the root * directory would imply encrypting the lost+found directory as well as * the filename "lost+found" itself. */ if (inode->i_ino == EXT4_ROOT_INO) return -EPERM; if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EOPNOTSUPP; res = ext4_convert_inline_data(inode); if (res) return res; /* * If a journal handle was specified, then the encryption context is * being set on a new inode via inheritance and is part of a larger * transaction to create the inode. Otherwise the encryption context is * being set on an existing inode in its own transaction. Only in the * latter case should the "retry on ENOSPC" logic be used. */ if (handle) { res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); } return res; } res = dquot_initialize(inode); if (res) return res; retry: res = ext4_xattr_set_credits(inode, len, false /* is_create */, &credits); if (res) return res; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (!res) res = res2; return res; } static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) { return ext4_has_feature_stable_inodes(sb); } static void ext4_get_ino_and_lblk_bits(struct super_block *sb, int *ino_bits_ret, int *lblk_bits_ret) { *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); } const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; |
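/*
 * Illustrative sketch (not part of the file above): the operations table
 * only takes effect once it is installed on the superblock at mount time.
 * This assumes fscrypt_set_ops() from <linux/fscrypt.h>; the surrounding
 * mount logic and error handling are omitted.
 */
static inline void example_install_ext4_cryptops(struct super_block *sb)
{
	fscrypt_set_ops(sb, &ext4_cryptops);
}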
15 15 15 15 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/namespace.c * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc. */ #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/rcupdate.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include "util.h" /* * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. */ static void free_ipc(struct work_struct *unused); static DECLARE_WORK(free_ipc_work, free_ipc); static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); } static void dec_ipc_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); } static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; again: ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) { /* * IPC namespaces are freed asynchronously, by free_ipc_work. * If frees were pending, flush_work will wait, and * return true. Fail the allocation if no frees are pending. */ if (flush_work(&free_ipc_work)) goto again; goto fail; } err = -ENOMEM; ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ns.ops = &ipcns_operations; refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; err = mq_init_ns(ns); if (err) goto fail_put; err = -ENOMEM; if (!setup_mq_sysctls(ns)) goto fail_put; if (!setup_ipc_sysctls(ns)) goto fail_mq; err = msg_init_ns(ns); if (err) goto fail_put; sem_init_ns(ns); shm_init_ns(ns); return ns; fail_mq: retire_mq_sysctls(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); fail_free: kfree(ns); fail_dec: dec_ipc_namespaces(ucounts); fail: return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); return create_ipc_ns(user_ns, ns); } /* * free_ipcs - free all ipcs of one type * @ns: the namespace to remove the ipcs from * @ids: the table of ipcs to free * @free: the function called to free each individual ipc * * Called for each kind of ipc when an ipc_namespace exits. 
*/ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, }; |
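/*
 * Illustrative sketch (not part of the file above): the reference pattern
 * the code above relies on. A user pins an ipc_namespace with get_ipc_ns()
 * (here for the current task, mirroring ipcns_get()) and later drops it
 * with put_ipc_ns(), which defers the actual teardown to free_ipc_work once
 * the last reference is gone. The helper name is hypothetical.
 */
static inline struct ipc_namespace *example_pin_current_ipc_ns(void)
{
	struct ipc_namespace *ns;

	task_lock(current);
	ns = get_ipc_ns(current->nsproxy->ipc_ns);
	task_unlock(current);

	/* The caller must balance this with put_ipc_ns(ns). */
	return ns;
}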
149 195 173 47 47 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. 
* * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { n->pprev = &n->next; n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. 
*/ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif |
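/*
 * Illustrative sketch (not part of the header above): the lockless lookup
 * pattern described in Documentation/RCU/rculist_nulls.rst. struct
 * example_obj, the table and the hashing by key are hypothetical; each head
 * is assumed to have been initialized with
 * INIT_HLIST_NULLS_HEAD(&table[slot], slot) so the nulls value encodes the
 * slot. Reference counting and the key re-check a real SLAB_TYPESAFE_BY_RCU
 * user would do after taking a reference are omitted.
 */
struct example_obj {
	struct hlist_nulls_node hash_node;
	u32 key;
};

static inline struct example_obj *example_lookup(struct hlist_nulls_head *table,
						 unsigned int nr_slots, u32 key)
{
	unsigned int slot = key % nr_slots;
	struct hlist_nulls_node *node;
	struct example_obj *obj;

	rcu_read_lock();
begin:
	hlist_nulls_for_each_entry_rcu(obj, node, &table[slot], hash_node) {
		if (obj->key == key)
			goto found;
	}
	/*
	 * The walk ended on a nulls marker. If it is not the marker of this
	 * slot, the last object visited was moved to another chain while we
	 * were traversing it, so the lookup must be restarted.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
	obj = NULL;
found:
	rcu_read_unlock();
	return obj;
}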
67 3 28 28 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | // SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/perf_event.h> #include <linux/bug.h> #include <linux/stddef.h> #include <asm/perf_regs.h> #include <asm/ptrace.h> #ifdef CONFIG_X86_32 #define PERF_REG_X86_MAX PERF_REG_X86_32_MAX #else #define PERF_REG_X86_MAX PERF_REG_X86_64_MAX #endif #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { PT_REGS_OFFSET(PERF_REG_X86_AX, ax), PT_REGS_OFFSET(PERF_REG_X86_BX, bx), PT_REGS_OFFSET(PERF_REG_X86_CX, cx), PT_REGS_OFFSET(PERF_REG_X86_DX, dx), PT_REGS_OFFSET(PERF_REG_X86_SI, si), PT_REGS_OFFSET(PERF_REG_X86_DI, di), PT_REGS_OFFSET(PERF_REG_X86_BP, bp), PT_REGS_OFFSET(PERF_REG_X86_SP, sp), PT_REGS_OFFSET(PERF_REG_X86_IP, ip), PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), PT_REGS_OFFSET(PERF_REG_X86_CS, cs), PT_REGS_OFFSET(PERF_REG_X86_SS, ss), #ifdef CONFIG_X86_32 PT_REGS_OFFSET(PERF_REG_X86_DS, ds), PT_REGS_OFFSET(PERF_REG_X86_ES, es), PT_REGS_OFFSET(PERF_REG_X86_FS, fs), PT_REGS_OFFSET(PERF_REG_X86_GS, gs), #else /* * The pt_regs struct does not store * ds, es, fs, gs in 64 bit mode. 
*/ (unsigned int) -1, (unsigned int) -1, (unsigned int) -1, (unsigned int) -1, #endif #ifdef CONFIG_X86_64 PT_REGS_OFFSET(PERF_REG_X86_R8, r8), PT_REGS_OFFSET(PERF_REG_X86_R9, r9), PT_REGS_OFFSET(PERF_REG_X86_R10, r10), PT_REGS_OFFSET(PERF_REG_X86_R11, r11), PT_REGS_OFFSET(PERF_REG_X86_R12, r12), PT_REGS_OFFSET(PERF_REG_X86_R13, r13), PT_REGS_OFFSET(PERF_REG_X86_R14, r14), PT_REGS_OFFSET(PERF_REG_X86_R15, r15), #endif }; u64 perf_reg_value(struct pt_regs *regs, int idx) { struct x86_perf_regs *perf_regs; if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { perf_regs = container_of(regs, struct x86_perf_regs, regs); if (!perf_regs->xmm_regs) return 0; return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; } if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } #define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \ ~((1ULL << PERF_REG_X86_MAX) - 1)) #ifdef CONFIG_X86_32 #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ (1ULL << PERF_REG_X86_R9) | \ (1ULL << PERF_REG_X86_R10) | \ (1ULL << PERF_REG_X86_R11) | \ (1ULL << PERF_REG_X86_R12) | \ (1ULL << PERF_REG_X86_R13) | \ (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) int perf_reg_validate(u64 mask) { if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; } u64 perf_reg_abi(struct task_struct *task) { return PERF_SAMPLE_REGS_ABI_32; } void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); } #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ (1ULL << PERF_REG_X86_FS) | \ (1ULL << PERF_REG_X86_GS)) int perf_reg_validate(u64 mask) { if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; } u64 perf_reg_abi(struct task_struct *task) { if (!user_64bit_mode(task_pt_regs(task))) return PERF_SAMPLE_REGS_ABI_32; else return PERF_SAMPLE_REGS_ABI_64; } static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); if (!in_nmi()) { regs_user->regs = user_regs; regs_user->abi = perf_reg_abi(current); return; } /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really * sufficient, though, as we could be in an NMI inside an interrupt * that happened during task_pt_regs setup. */ if (regs->sp > (unsigned long)&user_regs->r11 && regs->sp <= (unsigned long)(user_regs + 1)) { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; return; } /* * These registers are always saved on 64-bit syscall entry. * On 32-bit entry points, they are saved too except r8..r11. 
*/ regs_user_copy->ip = user_regs->ip; regs_user_copy->ax = user_regs->ax; regs_user_copy->cx = user_regs->cx; regs_user_copy->dx = user_regs->dx; regs_user_copy->si = user_regs->si; regs_user_copy->di = user_regs->di; regs_user_copy->r8 = user_regs->r8; regs_user_copy->r9 = user_regs->r9; regs_user_copy->r10 = user_regs->r10; regs_user_copy->r11 = user_regs->r11; regs_user_copy->orig_ax = user_regs->orig_ax; regs_user_copy->flags = user_regs->flags; regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = user_regs->cs; regs_user_copy->ss = user_regs->ss; /* * Store user space frame-pointer value on sample * to facilitate stack unwinding for cases when * user space executable code has such support * enabled at compile time: */ regs_user_copy->bp = user_regs->bp; regs_user_copy->bx = -1; regs_user_copy->r12 = -1; regs_user_copy->r13 = -1; regs_user_copy->r14 = -1; regs_user_copy->r15 = -1; /* * For this to be at all useful, we need a reasonable guess for * the ABI. Be careful: we're in NMI context, and we're * considering current to be the current task, so we should * be careful not to look at any other percpu variables that might * change during context switches. */ regs_user->abi = user_64bit_mode(user_regs) ? PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; regs_user->regs = regs_user_copy; } #endif /* CONFIG_X86_32 */ |
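To show how the register mask above is consumed, here is a hedged userspace sketch (not part of the kernel file): it opens a self-profiling software event that samples a few user-space registers through PERF_SAMPLE_REGS_USER. It assumes x86 UAPI headers are installed; a mask containing bits rejected by perf_reg_validate() above (the reserved bits, or r8..r15 on a 32-bit kernel) would make perf_event_open() fail with EINVAL.

#include <linux/perf_event.h>
#include <asm/perf_regs.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER;
	/* Only bits accepted by perf_reg_validate() may be set here. */
	attr.sample_regs_user = (1ULL << PERF_REG_X86_IP) |
				(1ULL << PERF_REG_X86_SP) |
				(1ULL << PERF_REG_X86_BP);

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */,
		     -1 /* any cpu */, -1 /* no group */, 0 /* no flags */);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/*
	 * Each sample in the mmap'ed ring buffer now carries the ABI value
	 * reported by perf_reg_abi() plus the three requested registers
	 * fetched through perf_reg_value().
	 */
	close(fd);
	return 0;
}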
893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 | /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the 
device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. 
*/ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void 
add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
* * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *event_ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); } static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); if (master_ndev) dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. 
*/ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. 
*/ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, 
		   &work->work);
	return NOTIFY_DONE;
}

static int inetaddr_event(struct notifier_block *this, unsigned long event,
			  void *ptr)
{
	struct sockaddr_in in;
	struct net_device *ndev;
	struct in_ifaddr *ifa = ptr;

	in.sin_family = AF_INET;
	in.sin_addr.s_addr = ifa->ifa_address;
	ndev = ifa->ifa_dev->dev;

	return addr_event(this, event, (struct sockaddr *)&in, ndev);
}

static int inet6addr_event(struct notifier_block *this, unsigned long event,
			   void *ptr)
{
	struct sockaddr_in6 in6;
	struct net_device *ndev;
	struct inet6_ifaddr *ifa6 = ptr;

	in6.sin6_family = AF_INET6;
	in6.sin6_addr = ifa6->addr;
	ndev = ifa6->idev->dev;

	return addr_event(this, event, (struct sockaddr *)&in6, ndev);
}

static struct notifier_block nb_netdevice = {
	.notifier_call = netdevice_event
};

static struct notifier_block nb_inetaddr = {
	.notifier_call = inetaddr_event
};

static struct notifier_block nb_inet6addr = {
	.notifier_call = inet6addr_event
};

int __init roce_gid_mgmt_init(void)
{
	gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0);
	if (!gid_cache_wq)
		return -ENOMEM;

	register_inetaddr_notifier(&nb_inetaddr);
	if (IS_ENABLED(CONFIG_IPV6))
		register_inet6addr_notifier(&nb_inet6addr);

	/* We rely on the netdevice notifier to enumerate all
	 * existing devices in the system. Register to this notifier
	 * last to make sure we will not miss any IP add/del
	 * callbacks.
	 */
	register_netdevice_notifier(&nb_netdevice);

	return 0;
}

void __exit roce_gid_mgmt_cleanup(void)
{
	if (IS_ENABLED(CONFIG_IPV6))
		unregister_inet6addr_notifier(&nb_inet6addr);
	unregister_inetaddr_notifier(&nb_inetaddr);
	unregister_netdevice_notifier(&nb_netdevice);

	/* Ensure all gid deletion tasks complete before we go down,
	 * to avoid any reference to freed memory. By the time
	 * ib-core is removed, all physical devices have been removed,
	 * so no issue with remaining hardware contexts.
	 */
	destroy_workqueue(gid_cache_wq);
}
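The notifier callbacks above never modify GID tables inline; they package the event into a work item, take a reference on the netdevice, and let gid_cache_wq do the heavy lifting later. The stand-alone sketch below shows that same defer-to-workqueue idiom with hypothetical demo_* names; it illustrates the pattern only and is not a replacement for the code above.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

static struct workqueue_struct *demo_wq;

struct demo_event_work {
	struct work_struct work;
	struct net_device *ndev;
	unsigned long event;
};

static void demo_event_work_handler(struct work_struct *_work)
{
	struct demo_event_work *work =
		container_of(_work, struct demo_event_work, work);

	/* Sleepable context: this is where the slow processing would go. */
	pr_info("demo: event %lu on %s\n", work->event, netdev_name(work->ndev));
	dev_put(work->ndev);		/* drop the reference taken below */
	kfree(work);
}

static int demo_netdevice_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct demo_event_work *work;

	if (event != NETDEV_UP && event != NETDEV_DOWN)
		return NOTIFY_DONE;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (!work)
		return NOTIFY_DONE;

	INIT_WORK(&work->work, demo_event_work_handler);
	dev_hold(ndev);			/* keep ndev alive until the work runs */
	work->ndev = ndev;
	work->event = event;
	queue_work(demo_wq, &work->work);
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_netdevice_event,
};

static int __init demo_init(void)
{
	int err;

	demo_wq = alloc_ordered_workqueue("demo-wq", 0);
	if (!demo_wq)
		return -ENOMEM;

	err = register_netdevice_notifier(&demo_nb);
	if (err)
		destroy_workqueue(demo_wq);
	return err;
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
	destroy_workqueue(demo_wq);	/* flushes any pending work items */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");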
// SPDX-License-Identifier: GPL-2.0
/*
 * XFRM virtual interface
 *
 * Copyright (C) 2018 secunet Security Networks AG
 *
 * Author:
 * Steffen Klassert <steffen.klassert@secunet.com>
 */

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sockios.h>
#include <linux/icmp.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_link.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>

#include <net/gso.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/addrconf.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/dst_metadata.h>
#include <net/netns/generic.h>
#include <linux/etherdevice.h>

static int xfrmi_dev_init(struct net_device *dev);
static void xfrmi_dev_setup(struct net_device *dev);
static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
static unsigned int xfrmi_net_id __read_mostly;
static const struct net_device_ops xfrmi_netdev_ops;

#define XFRMI_HASH_BITS	8
#define XFRMI_HASH_SIZE	BIT(XFRMI_HASH_BITS)

struct xfrmi_net {
	/* lists for storing interfaces in use */
	struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE];
	struct xfrm_if __rcu *collect_md_xfrmi;
};

static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = {
	[LWT_XFRM_IF_ID]	= NLA_POLICY_MIN(NLA_U32, 1),
	[LWT_XFRM_LINK]		= NLA_POLICY_MIN(NLA_U32, 1),
};

static void xfrmi_destroy_state(struct lwtunnel_state *lwt)
{
}

static int xfrmi_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static 
void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { dev->stats.rx_errors++; dev->stats.rx_dropped++; return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst 
= metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { stats->collisions++; net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { stats->tx_errors++; stats->tx_aborted_errors++; } return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); stats->tx_carrier_errors++; goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { stats->tx_carrier_errors++; goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: stats->tx_errors++; stats->tx_dropped++; kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, 
u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->p.link; } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ 
NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = gro_cells_init(&xi->gro_cells, dev); if (err) { free_percpu(dev->tstats); return err; } dev->features |= NETIF_F_LLTX; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct 
net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->net; } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) { struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, &list); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, &list); } unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations xfrmi_net_ops = { .exit_batch = xfrmi_exit_batch_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = 
xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 
protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface"); |
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> #include <linux/blk-cgroup.h> #include "../../block/blk.h" #include <trace/events/block.h> #include "trace_output.h" #ifdef CONFIG_BLK_DEV_IO_TRACE static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 #define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; static struct tracer_flags blk_tracer_flags = { .val = 0, .opts = blk_tracer_opts, }; /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ?
sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); } } /* * Send out a notify for this process, if we haven't done so since a trace * started */ static void trace_note_tsk(struct task_struct *tsk) { unsigned long flags; struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) { struct timespec64 now; unsigned long flags; u32 words[2]; /* need to check user space to see if this breaks in y2038 or y2106 */ ktime_get_real_ts64(&now); words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } void __blk_trace_note_message(struct blk_trace *bt, struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) return; /* * If the BLK_TC_NOTIFY action mask isn't set, don't send any note * message to the trace. */ if (!(bt->act_mask & BLK_TC_NOTIFY)) return; local_irq_save(flags); buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); #ifdef CONFIG_BLK_CGROUP if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) cgid = cgroup_id(css->cgroup); else cgid = 1; #endif trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; return 0; } /* * Data direction bit lookup */ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer. 
*/ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, const blk_opf_t opf, u32 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; pid_t pid; int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? WRITE : READ]; what |= MASK_TC_BIT(opf, SYNC); what |= MASK_TC_BIT(opf, RAHEAD); what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); if (cgid) what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (unlikely(tsk->btrace_seq != blktrace_seq)) trace_note_tsk(tsk); /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); record_it: /* * These two are not needed in ftrace as they are in the * generic trace_entry, filled by tracing_generic_entry_update, * but for the trace_event->bin() synthesizer benefit we do it * here too. */ t->cpu = cpu; t->pid = pid; t->sector = sector; t->bytes = bytes; t->action = what; t->device = bt->dev; t->error = error; t->pdu_len = pdu_len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } } local_irq_restore(flags); } static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); /* * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created * under 'q->debugfs_dir', thus lookup and remove them. 
*/ if (!bt->dir) { debugfs_lookup_and_remove("dropped", q->debugfs_dir); debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); } static void get_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (++blk_probes_ref == 1) blk_register_tracepoints(); mutex_unlock(&blk_probe_mutex); } static void put_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (!--blk_probes_ref) blk_unregister_tracepoints(); mutex_unlock(&blk_probe_mutex); } static int blk_trace_start(struct blk_trace *bt) { if (bt->trace_state != Blktrace_setup && bt->trace_state != Blktrace_stopped) return -EINVAL; blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); return 0; } static int blk_trace_stop(struct blk_trace *bt) { if (bt->trace_state != Blktrace_running) return -EINVAL; bt->trace_state = Blktrace_stopped; raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); return 0; } static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; blk_trace_cleanup(q, bt); return 0; } int blk_trace_remove(struct request_queue *q) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; char buf[16]; snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { char *msg; struct blk_trace *bt; if (count >= BLK_TN_MAX_MSG) return -EINVAL; msg = memdup_user_nul(buffer, count); if (IS_ERR(msg)) return PTR_ERR(msg); bt = filp->private_data; __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; } static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. 
*/ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { struct blk_trace *bt; if (!relay_buf_full(buf)) return 1; bt = buf->chan->private_data; atomic_inc(&bt->dropped); return 0; } static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); return 0; } static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, umode_t mode, struct rchan_buf *buf, int *is_global) { return debugfs_create_file(filename, mode, parent, buf, &relay_file_operations); } static const struct rchan_callbacks blk_relay_callbacks = { .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { if (bdev) { bt->start_lba = bdev->bd_start_sect; bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; } } /* * Setup everything required to start tracing */ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; /* * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ strreplace(buts->name, '/', '_'); /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; /* * When tracing the whole disk reuse the existing debugfs directory * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory * is required, contrary to the usual mantra of not checking for debugfs * files or directories. 
*/ if (IS_ERR_OR_NULL(dir)) { pr_warn("debugfs_dir not present for %s so skipping\n", buts->name); ret = -ENOENT; goto err; } bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) goto err; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; if (buts->end_lba) bt->end_lba = buts->end_lba; bt->pid = buts->pid; bt->trace_state = Blktrace_setup; rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); ret = 0; err: if (ret) blk_trace_free(q, bt); return ret; } static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { __blk_trace_remove(q); return -EFAULT; } return 0; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; struct compat_blk_user_trace_setup cbuts; int ret; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, .start_lba = cbuts.start_lba, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { __blk_trace_remove(q); return -EFAULT; } return 0; } #endif static int __blk_trace_startstop(struct request_queue *q, int start) { struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; if (start) return blk_trace_start(bt); else return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_startstop); /* * When reading or writing the blktrace sysfs files, the references to the * opened sysfs or device files should prevent the underlying block device * from being removed. So no further delete protection is really needed. 
*/ /** * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) { struct request_queue *q = bdev_get_queue(bdev); int ret, start = 0; char b[BDEVNAME_SIZE]; mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif case BLKTRACESTART: start = 1; fallthrough; case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; break; } mutex_unlock(&q->debugfs_mutex); return ret; } /** * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); } #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; blkcg_css = bio_blkcg_css(bio); if (!blkcg_css) return 0; return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } #endif static u64 blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* * blktrace probes */ /** * blk_add_trace_rq - Add a trace for a request oriented action * @rq: the source request * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. 
* **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); else what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_status_t error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, blk_trace_request_get_cgid(rq)); } /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for * @bio: the source bio * @what: the action * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; if (explicit) what = BLK_TA_UNPLUG_IO; else what = BLK_TA_UNPLUG_TIMER; __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, 
struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @bio: the source bio * @dev: source device * @from: source sector * * Called after a bio is remapped to a different device and/or sector. **/ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation * @ignore: trace callback data parameter (not used) * @rq: the source request * @dev: target device * @from: source sector * * Description: * Device mapper remaps request to other devices. * Add a trace for that action. * **/ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data * @rq: io request * @data: driver-specific data * @len: length of driver-specific data * * Description: * Some drivers might want to write driver-specific data per request. 
* **/ void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); static void blk_register_tracepoints(void) { int ret; ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); tracepoint_synchronize_unregister(); } /* * struct blk_io_tracer formatting routines */ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } if (tc & BLK_TC_FLUSH) rwbs[i++] = 'F'; if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; if (tc & BLK_TC_FUA) rwbs[i++] = 'F'; if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; if (tc & BLK_TC_META) rwbs[i++] = 'M'; out: rwbs[i] = '\0'; } static inline const struct blk_io_trace 
*te_blk_io_trace(const struct trace_entry *ent) { return (const struct blk_io_trace *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } static inline u64 t_cgid(const struct trace_entry *ent) { return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) { return te_blk_io_trace(ent)->action; } static inline u32 t_bytes(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes; } static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; } static inline unsigned long long t_sector(const struct trace_entry *ent) { return te_blk_io_trace(ent)->sector; } static inline __u16 t_error(const struct trace_entry *ent) { return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); static void blk_log_action_classic(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); trace_seq_printf(&iter->seq, "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, secs, nsec_rem, iter->ent->pid, act, rwbs); } static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; cgroup_path_from_kernfs_id(id, blkcg_name_buf, sizeof(blkcg_name_buf)); trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); } else { /* * The cgid portion used to be "INO,GEN". Userland * builds a FILEID_INO32_GEN fid out of them and * opens the cgroup using open_by_handle_at(2). * While 32bit ino setups are still the same, 64bit * ones now use the 64bit ino as the whole ID and * no longer use generation. * * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). */ trace_seq_printf(&iter->seq, "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), id & U32_MAX, id >> 32, act, rwbs); } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; pdu_buf = pdu_start(ent, has_cg); pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) if (pdu_buf[end]) break; end++; trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { trace_seq_printf(s, "%s%02x", i == 0 ? "" : " ", pdu_buf[i]); /* * stop when the rest is just zeros and indicate so * with a ".." 
appended */ if (i == end && end != pdu_len - 1) { trace_seq_puts(s, " ..) "); return; } } trace_seq_puts(s, ") "); } static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); else trace_seq_printf(s, "[%s]\n", cmd); } } static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), t_sec(ent), t_error(ent)); else trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); } } static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(be32_to_cpu(__r->device_from)), MINOR(be32_to_cpu(__r->device_from)), be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s]\n", cmd); } static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), get_pdu_int(ent, has_cg), cmd); } static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { trace_seq_putmem(s, pdu_start(ent, has_cg), pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } /* * struct tracer operations */ static void blk_tracer_print_header(struct seq_file *m) { if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return; seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" "# | | | | | |\n"); } static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; } static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); return 0; } static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; } static void blk_tracer_reset(struct trace_array *tr) { blk_tracer_stop(tr); } static const struct { const char *act[2]; void (*print)(struct trace_seq *s, const struct trace_entry *ent, bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, 
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; bool long_act; blk_log_action_t *log_action; bool has_cg; t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { log_action(iter, long_act ? "message" : "m", has_cg); blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { log_action(iter, what2act[what].act[long_act], has_cg); what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags, struct trace_event *event) { return print_one_line(iter, false); } static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, }; trace_seq_putmem(s, &old, offset); trace_seq_putmem(s, &t->sector, sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { blk_trace_synthesize_old_trace(iter); return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { if ((iter->ent->type != TRACE_BLK) || !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); } static int blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, .reset = blk_tracer_reset, .start = blk_tracer_start, .stop = blk_tracer_stop, .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, .set_flag = blk_tracer_set_flag, }; static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; static struct trace_event trace_blk_event = { .type = TRACE_BLK, .funcs = &trace_blk_event_funcs, }; static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } return 0; } device_initcall(init_blk_tracer); static int blk_trace_remove_queue(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); 
if (bt == NULL) return -EINVAL; blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); blk_trace_free(q, bt); return 0; } /* * Setup everything required to start tracing */ static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto free_bt; bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; blk_trace_setup_lba(bt, bdev); rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); return 0; free_bt: blk_trace_free(q, bt); return ret; } /* * sysfs interface to enable and configure tracing */ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); #define BLK_TRACE_DEVICE_ATTR(_name) \ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); static BLK_TRACE_DEVICE_ATTR(end_lba); static struct attribute *blk_trace_attrs[] = { &dev_attr_enable.attr, &dev_attr_act_mask.attr, &dev_attr_pid.attr, &dev_attr_start_lba.attr, &dev_attr_end_lba.attr, NULL }; struct attribute_group blk_trace_attr_group = { .name = "trace", .attrs = blk_trace_attrs, }; static const struct { int mask; const char *str; } mask_maps[] = { { BLK_TC_READ, "read" }, { BLK_TC_WRITE, "write" }, { BLK_TC_FLUSH, "flush" }, { BLK_TC_SYNC, "sync" }, { BLK_TC_QUEUE, "queue" }, { BLK_TC_REQUEUE, "requeue" }, { BLK_TC_ISSUE, "issue" }, { BLK_TC_COMPLETE, "complete" }, { BLK_TC_FS, "fs" }, { BLK_TC_PC, "pc" }, { BLK_TC_NOTIFY, "notify" }, { BLK_TC_AHEAD, "ahead" }, { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, }; static int blk_trace_str2mask(const char *str) { int i; int mask = 0; char *buf, *s, *token; buf = kstrdup(str, GFP_KERNEL); if (buf == NULL) return -ENOMEM; s = strstrip(buf); while (1) { token = strsep(&s, ","); if (token == NULL) break; if (*token == '\0') continue; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (strcasecmp(token, mask_maps[i].str) == 0) { mask |= mask_maps[i].mask; break; } } if (i == ARRAY_SIZE(mask_maps)) { mask = -EINVAL; break; } } kfree(buf); return mask; } static ssize_t blk_trace_mask2str(char *buf, int mask) { int i; char *p = buf; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (mask & mask_maps[i].mask) { p += sprintf(p, "%s%s", (p == buf) ? 
"" : ",", mask_maps[i].str); } } *p++ = '\n'; return p - buf; } static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); return ret; } static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; if (count == 0) goto out; if (attr == &dev_attr_act_mask) { if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } } else { if (kstrtoull(buf, 0, &value)) goto out; } mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } if (value) ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; } ret = 0; if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { if (attr == &dev_attr_act_mask) bt->act_mask = value; else if (attr == &dev_attr_pid) bt->pid = value; else if (attr == &dev_attr_start_lba) bt->start_lba = value; else if (attr == &dev_attr_end_lba) bt->end_lba = value; } out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); out: return ret ? ret : count; } #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. * @rwbs: buffer to be filled * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: * Maps each request operation and flag to a single character and fills the * buffer provided by the caller with resulting string. * **/ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; rwbs[i++] = 'E'; break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; case REQ_OP_READ: rwbs[i++] = 'R'; break; default: rwbs[i++] = 'N'; } if (opf & REQ_FUA) rwbs[i++] = 'F'; if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; if (opf & REQ_SYNC) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ |
// SPDX-License-Identifier: GPL-2.0+ /* * Restartable sequences system call * * Copyright (C) 2015, Google, Inc., * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> * Copyright (C) 2015-2018, EfficiOS Inc., * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) /* * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing * per-cpu operations. * * It allows user-space to perform update operations on per-cpu data * without requiring heavy-weight atomic operations. * * Detailed algorithm of rseq user-space assembly sequences: * * init(rseq_cs) * cpu = TLS->rseq::cpu_id_start * [1] TLS->rseq::rseq_cs = rseq_cs * [start_ip] ---------------------------- * [2] if (cpu != TLS->rseq::cpu_id) * goto abort_ip; * [3] <last_instruction_in_cs> * [post_commit_ip] ---------------------------- * * The address of jump target abort_ip must be outside the critical * region, i.e.: * * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] * * Steps [2]-[3] (inclusive) need to be a sequence of instructions in * userspace that can handle being interrupted between any of those * instructions, and then resumed to the abort_ip. * * 1.
Userspace stores the address of the struct rseq_cs assembly * block descriptor into the rseq_cs field of the registered * struct rseq TLS area. This update is performed through a single * store within the inline assembly instruction sequence. * [start_ip] * * 2. Userspace tests to check whether the current cpu_id field match * the cpu number loaded before start_ip, branching to abort_ip * in case of a mismatch. * * If the sequence is preempted or interrupted by a signal * at or after start_ip and before post_commit_ip, then the kernel * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return * ip to abort_ip before returning to user-space, so the preempted * execution resumes at abort_ip. * * 3. Userspace critical section final instruction before * post_commit_ip is the commit. The critical section is * self-terminating. * [post_commit_ip] * * 4. <success> * * On failure at [2], or if interrupted by preempt or signal delivery * between [1] and [3]: * * [abort_ip] * F1. <failure> */ static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); unsafe_put_user(node_id, &rseq->node_id, efault_end); unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); trace_rseq_update(t); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). */ if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; /* * Reset node_id to its initial state (0). */ if (put_user(node_id, &t->rseq->node_id)) return -EFAULT; /* * Reset mm_cid to its initial state (0). */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ return 0; } static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; u64 ptr; u32 __user *usig; u32 sig; int ret; #ifdef CONFIG_64BIT if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; if (rseq_cs->start_ip >= TASK_SIZE || rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || rseq_cs->abort_ip >= TASK_SIZE || rseq_cs->version > 0) return -EINVAL; /* Check for overflow. */ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) return -EINVAL; /* Ensure that abort_ip is not in the critical section. 
*/ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; if (current->rseq_sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); return -EINVAL; } return 0; } static bool rseq_warn_flags(const char *str, u32 flags) { u32 test_flags; if (!flags) return false; test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); return true; } static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* * Load and clear event mask atomically with respect to * scheduler preemption. */ preempt_disable(); event_mask = t->rseq_event_mask; t->rseq_event_mask = 0; preempt_enable(); return !!event_mask; } static int clear_rseq_cs(struct task_struct *t) { /* * The rseq_cs field is set to NULL on preemption or signal * delivery on top of rseq assembly block, as well as on top * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT return put_user(0UL, &t->rseq->rseq_cs); #else if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif } /* * Unsigned comparison will be true when ip >= start_ip, and when * ip < start_ip + post_commit_offset. */ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) { return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; int ret; ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; ret = clear_rseq_cs(t); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; } /* * This resume handler must always be executed between any of: * - preemption, * - signal delivery, * and return to user-space. * * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; /* * regs is NULL if and only if the caller is in a syscall path. Skip * fixup and leave rseq_cs as is so that rseq_sycall() will detect and * kill a misbehaving userspace on debug kernels. 
*/ if (regs) { ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) goto error; } if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; error: sig = ksig ? ksig->sig : 0; force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ void rseq_syscall(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; if (!t->rseq) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } #endif /* * sys_rseq - setup restartable sequences for caller thread. */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; current->rseq_len = 0; return 0; } if (unlikely(flags)) return -EINVAL; if (current->rseq) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; } /* * If there was no rseq previously registered, ensure the provided rseq * is properly aligned, as communcated to user-space through the ELF * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * * In order to be valid, rseq_len is either the original rseq size, or * large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ rseq_set_notify_resume(current); return 0; } |
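Registration from user space is a single syscall against the layout and alignment rules checked above. The sketch below is illustrative only: it assumes <linux/rseq.h> and __NR_rseq are available, and RSEQ_SIG is an arbitrary example signature (it only has to match the value embedded before each abort handler). On a libc that already registers rseq for its threads, the call fails through the "already registered" checks above (-EINVAL/-EBUSY).

/* Minimal rseq registration sketch (assumptions noted in the comment above). */
#define _GNU_SOURCE
#include <linux/rseq.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#define RSEQ_SIG 0x53053053	/* example signature chosen by this program */

/* The registered area must be aligned as advertised by AT_RSEQ_ALIGN;
 * 32 matches the original ORIG_RSEQ_SIZE layout used below.
 */
static __thread struct rseq rseq_area __attribute__((aligned(32)));

int main(void)
{
	/* rseq_len == ORIG_RSEQ_SIZE (32) selects the original fixed-size ABI. */
	if (syscall(__NR_rseq, &rseq_area, 32U, 0, RSEQ_SIG)) {
		fprintf(stderr, "rseq registration: %s\n", strerror(errno));
		return 1;
	}
	/* rseq_set_notify_resume() at registration time guarantees these
	 * fields are filled in before the syscall returns to user space.
	 */
	printf("cpu_id_start=%u cpu_id=%u\n",
	       rseq_area.cpu_id_start, rseq_area.cpu_id);
	return 0;
}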
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign(
strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = 
(struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
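The EM()/EMe() lists above are expanded twice: first with EM() mapped to TRACE_DEFINE_ENUM() so the enum values are exported to user space in the trace format, then remapped to { value, "name" } pairs consumed by __print_symbolic() in TP_printk(). The standalone program below shows the same double-expansion trick outside the tracing infrastructure; the reduced state list and the tcp_state_name() helper are hypothetical stand-ins, not part of this header.

/* Standalone illustration of the EM()/EMe() double expansion used above.
 * Everything here (the reduced list, tcp_state_name()) is a stand-in.
 */
#include <stdio.h>

#define TCP_STATE_LIST \
	EM(TCP_ESTABLISHED)	\
	EM(TCP_SYN_SENT)	\
	EMe(TCP_CLOSE)

/* First expansion: enumerators (the role TRACE_DEFINE_ENUM() plays above). */
#define EM(a)  a,
#define EMe(a) a
enum tcp_state { TCP_STATE_LIST };
#undef EM
#undef EMe

/* Second expansion: { value, "name" } pairs (the __print_symbolic() role). */
#define EM(a)  { a, #a },
#define EMe(a) { a, #a }
static const struct { int val; const char *str; } tcp_state_names[] = {
	TCP_STATE_LIST
};
#undef EM
#undef EMe

static const char *tcp_state_name(int val)
{
	for (unsigned int i = 0;
	     i < sizeof(tcp_state_names) / sizeof(tcp_state_names[0]); i++)
		if (tcp_state_names[i].val == val)
			return tcp_state_names[i].str;
	return "UNKNOWN";
}

int main(void)
{
	printf("%d -> %s\n", TCP_SYN_SENT, tcp_state_name(TCP_SYN_SENT));
	return 0;
}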
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/inode.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Big-endian to little-endian byte-swapping/bitmaps by
 * David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
#include <linux/random.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fsnotify.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

#include "ext4.h"
#include "ext4_extents.h"	/* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
#include "fsmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>

static struct ext4_lazy_init *ext4_li_info;
static DEFINE_MUTEX(ext4_li_mtx);
static struct ratelimit_state ext4_mount_msg_ratelimit;

static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
			     unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static void ext4_update_super(struct super_block *sb);
static int ext4_commit_super(struct super_block *sb);
static int ext4_mark_recovery_complete(struct super_block *sb,
				       struct ext4_super_block *es);
static int ext4_clear_journal_err(struct super_block *sb,
				  struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
					    unsigned int journal_inum);
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
				      struct super_block *sb);
static
void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } return ext4_read_bh(bh, op_flags, NULL); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. 
*/ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { return __ext4_sb_bread_gfp(sb, block, 0, 0); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than an hour has passed since the last superblock update, and * 2. More than 16MB have been written since the last superblock update. 
* * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || (journal->j_flags & JBD2_UNMOUNT)) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone * else. Once this happens, any attempt to call mark_buffer_dirty() * (for example, by ext4_commit_super), will cause a kernel OOPS. * This is a kludge to prevent these oops until we can put in a proper * hook in del_gendisk() to inform the VFS and file system layers. */ static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); spin_lock(&sbi->s_md_lock); } spin_unlock(&sbi->s_md_lock); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. 
*/ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. 
On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (sb_rdonly(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before * ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) 
{ struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
*/ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. 
We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. 
*/ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. 
*/ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) ext4_commit_super(sb); ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(sbi->s_journal_bdev); invalidate_bdev(sbi->s_journal_bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. 
*/ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (EXT4_I(inode)->i_reserved_data_blocks) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT), offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
*/ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, 
.unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. 
*/ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_u32 ("resgid", Opt_resgid), fsparam_u32 ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", 
Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | 
EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. 
*/ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; if (ctx->s_qf_names[qtype]) kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; kuid_t uid; kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return 
unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", result.uint_32); return -EINVAL; } ctx->s_resuid = uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", result.uint_32); return -EINVAL; } ctx->s_resgid = gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path 
path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. 
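 * (For example, a bare "nobarrier" matches the MOPT_CLEAR entry for
 * EXT4_MOUNT_BARRIER in ext4_mount_opts[] and simply clears that bit.)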
Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; }
static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, &param); if (param.string) kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; }
static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; }
static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency.
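 * Invoked from ext4_check_opt_consistency() below: the freshly parsed
 * options are compared against the state already loaded on the
 * superblock, e.g. changing a journaled quota file while quota is
 * turned on is rejected.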
*/ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. 
We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { int blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (blocksize < PAGE_SIZE) ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " "experimental mount option 'dioread_nolock' " "for blocksize < PAGE_SIZE"); } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return 
ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? 
'\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, 
MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); 
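	/*
	 * Each flex_groups entry aggregates the free inode, free cluster
	 * and used directory counts of 2^s_log_groups_per_flex block
	 * groups; the loop below folds every group descriptor into its
	 * flex group's counters.
	 */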
struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, 
"ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. 
We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. 
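 * (For example, with 4KiB blocks a group holds at most 32768 blocks, so
 * an s_raid_stripe_width of 65536 recorded in the superblock is ignored
 * here.)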
* Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } #if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } #endif if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. 
Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. 
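 * (So a return value equal to the total number of groups means every
 * inode table is already initialized and there is nothing for the
 * thread to do.)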
*/ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. 
*/ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? 
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. 
In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. 
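 * (With metadata_csum, set_journal_csum_feature_set() then selects the v3
 * journal checksum format, JBD2_FEATURE_INCOMPAT_CSUM_V3, rather than the
 * v1 JBD2_FEATURE_COMPAT_CHECKSUM one.)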
*/ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize == PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " "clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? 
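 * (e.g. with the default 4 KiB block size and no bigalloc, clustersize is
 * 4096 bytes and the standard group is 4096 * 8 = 32768 blocks.)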
*/ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. 
flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { int ret = PTR_ERR(sbi->s_chksum_driver); ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); sbi->s_chksum_driver = NULL; return ret; } /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. 
*/ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. 
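 * (s_first_data_block is normally 1 on 1 KiB block size filesystems, where
 * the first kilobyte is reserved for boot code, and 0 for larger block
 * sizes; a value at or beyond s_blocks_count therefore indicates a corrupt
 * superblock.)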
*/ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && 
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: /* flush s_sb_upd_work before destroying the journal. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. 
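 * For instance, with the default s_sb_block of 1 and a device whose logical
 * block size makes sb_min_blocksize() return 4096, logical_sb_block becomes
 * 0 and offset becomes 1024, i.e. the superblock is still read from absolute
 * byte offset 1024.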
*/ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static void ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", 
sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
		 SB_POSIXACL : 0);

	/* i_version is always enabled now */
	sb->s_flags |= SB_I_VERSION;

	err = ext4_check_feature_compatibility(sb, es, silent);
	if (err)
		goto failed_mount;

	err = ext4_block_group_meta_init(sb, silent);
	if (err)
		goto failed_mount;

	ext4_hash_info_init(sb);

	err = ext4_handle_clustersize(sb);
	if (err)
		goto failed_mount;

	err = ext4_check_geometry(sb, es);
	if (err)
		goto failed_mount;

	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
	spin_lock_init(&sbi->s_error_lock);
	INIT_WORK(&sbi->s_sb_upd_work, update_super_work);

	err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
	if (err)
		goto failed_mount3;

	err = ext4_es_register_shrinker(sbi);
	if (err)
		goto failed_mount3;

	sbi->s_stripe = ext4_get_stripe_size(sbi);
	/*
	 * It's hard to get stripe-aligned blocks if the stripe is not aligned
	 * with the cluster, so just disable the stripe and alert the user, to
	 * simplify the code and avoid stripe-aligned allocation, which would
	 * rarely succeed.
	 */
	if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
	    sbi->s_stripe % sbi->s_cluster_ratio != 0) {
		ext4_msg(sb, KERN_WARNING,
			 "stripe (%lu) is not aligned with cluster size (%u), "
			 "stripe is disabled",
			 sbi->s_stripe, sbi->s_cluster_ratio);
		sbi->s_stripe = 0;
	}

	sbi->s_extent_max_zeroout_kb = 32;

	/*
	 * set up enough so that it can read an inode
	 */
	sb->s_op = &ext4_sops;
	sb->s_export_op = &ext4_export_ops;
	sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_FS_ENCRYPTION
	sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_FS_VERITY
	sb->s_vop = &ext4_verityops;
#endif
#ifdef CONFIG_QUOTA
	sb->dq_op = &ext4_quota_operations;
	if (ext4_has_feature_quota(sb))
		sb->s_qcop = &dquot_quotactl_sysfile_ops;
	else
		sb->s_qcop = &ext4_qctl_operations;
	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));

	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
	mutex_init(&sbi->s_orphan_lock);

	ext4_fast_commit_init(sb);

	sb->s_root = NULL;

	needs_recovery = (es->s_last_orphan != 0 ||
			  ext4_has_feature_orphan_present(sb) ||
			  ext4_has_feature_journal_needs_recovery(sb));

	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
		err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
		if (err)
			goto failed_mount3a;
	}

	err = -EINVAL;
	/*
	 * The first inode we look at is the journal inode.  Don't try
	 * root first: it may be modified in the journal!
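 * (Until the journal has been replayed, the root inode's latest contents may
 * exist only in the log, so reading it now could return stale metadata.)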
*/ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. 
*/ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_register_sysfs(sb); if (err) goto failed_mount7; err = ext4_init_orphan_info(sb); if (err) goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount9; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount10; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
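 * (ratelimit_state_init(state, interval, burst): an interval of 5 * HZ
 * jiffies, i.e. five seconds, with a burst of 10 messages.)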
*/ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); return 0; failed_mount10: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { /* flush s_sb_upd_work before journal destroy. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev) { invalidate_bdev(sbi->s_journal_bdev); blkdev_put(sbi->s_journal_bdev, sb); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. 
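 * (These mirror the cached commit=, min_batch_time=, max_batch_time=,
 * barrier and data_err= mount options.)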
*/ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; if (test_opt(sb, DATA_ERR_ABORT)) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? 
ret : -EIO); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; /* see get_tree_bdev why this is needed and safe */ up_write(&sb->s_umount); bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, &fs_holder_ops); down_write(&sb->s_umount); if (IS_ERR(bdev)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); return ERR_CAST(bdev); } blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev; out_bh: brelse(bh); out_bdev: blkdev_put(bdev, sb); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct block_device *journal_bdev; int errno = 0; journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(journal_bdev)) return ERR_CAST(journal_bdev); journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev = journal_bdev; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: 
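	/* Release the reference ext4_get_journal_blkdev() took on the bdev. */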
blkdev_put(journal_bdev, sb); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? 
: err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { EXT4_SB(sb)->s_journal = NULL; jbd2_journal_destroy(journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: jbd2_journal_destroy(journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strncpy(es->s_first_error_func, sbi->s_first_error_func, sizeof(es->s_first_error_func)); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strncpy(es->s_last_error_func, sbi->s_last_error_func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already */ if (!es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; if (block_device_ejected(sb)) return -ENODEV; ext4_update_super(sb); 
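	/*
	 * ext4_update_super() refreshed the in-memory superblock under the
	 * buffer lock; take the lock again and write the buffer out
	 * synchronously below.
	 */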
lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. 
*/ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so barrier must * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean * state independently. It relies on upper layer to stop all data & metadata * modifications. */ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. 
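 * (ext4_freeze() cleared it after flushing the journal so that a snapshot of
 * the frozen device mounts clean; once updates resume, the journal is live
 * again and the flag must be set.)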
*/ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; } ext4_apply_options(fc, sb); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. 
*/ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", ext4_quota_mode(sb)); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? (buf->f_blocks - curblock) : 0; } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
(buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); return PTR_ERR(handle); } ret = dquot_release(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to 
be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. */ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. 
*/ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? 
*/ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *journal_bdev = sbi ? 
sbi->s_journal_bdev : NULL; kill_block_super(sb); if (journal_bdev) blkdev_put(journal_bdev, sb); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { int i, err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) |
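/*
 * Editor's note (illustration only, not part of ext4): the module init path
 * above uses the common kernel "goto unwind" idiom -- each successfully
 * initialized subsystem gets a matching label so that a failure at step N
 * tears down steps N-1..1 in reverse order, as ext4_init_fs does with its
 * out/out05/out1.. labels.  A minimal, self-contained userspace sketch of the
 * same idiom, with hypothetical init_a/init_b/init_c helpers standing in for
 * the real ext4 subsystems:
 */
#include <stdio.h>

static int init_a(void) { return 0; }		/* pretend these can fail */
static int init_b(void) { return 0; }
static int init_c(void) { return -1; }		/* force a failure at step 3 */
static void exit_a(void) { puts("undo a"); }
static void exit_b(void) { puts("undo b"); }

static int demo_init(void)
{
	int err;

	err = init_a();
	if (err)
		goto out;
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;
	return 0;

out_b:
	exit_b();
out_a:
	exit_a();
out:
	return err;
}

int main(void)
{
	/* prints "undo b" then "undo a": teardown runs in reverse order */
	return demo_init() ? 1 : 0;
}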
/*
 * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
 * cleaned up code to current version of sparse and added the slicing-by-8
 * algorithm to the closely similar existing slicing-by-4 algorithm.
 *
 * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
 * Code was from the public domain, copyright abandoned.  Code was
 * subsequently included in the kernel, thus was re-licensed under the
 * GNU GPL v2.
 *
 * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Same crc32 function was used in 5 other places in the kernel.
 * I made one version, and deleted the others.
 * There are various incantations of crc32().  Some use a seed of 0 or ~0.
 * Some xor at the end with ~0.  The generic crc32() function takes
 * seed as an argument, and doesn't xor at the end.  Then individual
 * users can do whatever they need.
 *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
 *   fs/jffs2 uses seed 0, doesn't xor with ~0.
 *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
*/ /* see: Documentation/staging/crc32.rst for a description of algorithms */ #include <linux/crc32.h> #include <linux/crc32poly.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include "crc32defs.h" #if CRC_LE_BITS > 8 # define tole(x) ((__force u32) cpu_to_le32(x)) #else # define tole(x) (x) #endif #if CRC_BE_BITS > 8 # define tobe(x) ((__force u32) cpu_to_be32(x)) #else # define tobe(x) (x) #endif #include "crc32table.h" MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>"); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 /* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 __pure crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) # define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \ t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255]) # define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \ t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255]) # else # define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) # define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \ t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255]) # define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \ t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255]) # endif const u32 *b; size_t rem_len; # ifdef CONFIG_X86 size_t i; # endif const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; # if CRC_LE_BITS != 32 const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; # endif u32 q; /* Align it */ if (unlikely((long)buf & 3 && len)) { do { DO_CRC(*buf++); } while ((--len) && ((long)buf)&3); } # if CRC_LE_BITS == 32 rem_len = len & 3; len = len >> 2; # else rem_len = len & 7; len = len >> 3; # endif b = (const u32 *)buf; # ifdef CONFIG_X86 --b; for (i = 0; i < len; i++) { # else for (--b; len; --len) { # endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; # else crc = DO_CRC8; q = *++b; crc ^= DO_CRC4; # endif } len = rem_len; /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; # ifdef CONFIG_X86 for (i = 0; i < len; i++) DO_CRC(*++p); /* use pre increment for speed */ # else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); # endif } return crc; #undef DO_CRC #undef DO_CRC4 #undef DO_CRC8 } #endif /** * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II * CRC32/CRC32C * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other * uses, or the previous crc32/crc32c value if computing incrementally. * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p * @tab: little-endian Ethernet table * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i < 8; i++) crc = (crc >> 1) ^ ((crc & 1) ? 
polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; crc = (crc >> 4) ^ tab[0][crc & 15]; crc = (crc >> 4) ^ tab[0][crc & 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; crc = (crc >> 8) ^ tab[0][crc & 255]; } # else crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } #if CRC_LE_BITS == 1 u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE); } #else u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); } #endif EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); /* * This multiplies the polynomials x and y modulo the given modulus. * This follows the "little-endian" CRC convention that the lsbit * represents the highest power of x, and the msbit represents x^0. */ static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus) { u32 product = x & 1 ? y : 0; int i; for (i = 0; i < 31; i++) { product = (product >> 1) ^ (product & 1 ? modulus : 0); x >>= 1; product ^= x & 1 ? y : 0; } return product; } /** * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) * @len: The number of bytes. @crc is multiplied by x^(8*@len) * @polynomial: The modulus used to reduce the result to 32 bits. * * It's possible to parallelize CRC computations by computing a CRC * over separate ranges of a buffer, then summing them. * This shifts the given CRC by 8*len bits (i.e. produces the same effect * as appending len bytes of zero to the data), in time proportional * to log(len). */ static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len, u32 polynomial) { u32 power = polynomial; /* CRC of x^32 */ int i; /* Shift up to 32 bits in the simple linear way */ for (i = 0; i < 8 * (int)(len & 3); i++) crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0); len >>= 2; if (!len) return crc; for (;;) { /* "power" is x^(2^i), modulo the polynomial */ if (len & 1) crc = gf2_multiply(crc, power, polynomial); len >>= 1; if (!len) break; /* Square power, advancing to x^(2^(i+1)) */ power = gf2_multiply(power, power, polynomial); } return crc; } u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32_POLY_LE); } u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32C_POLY_LE); } EXPORT_SYMBOL(crc32_le_shift); EXPORT_SYMBOL(__crc32c_le_shift); /** * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p * @tab: big-endian Ethernet table * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ << 24; for (i = 0; i < 8; i++) crc = (crc << 1) ^ ((crc & 0x80000000) ? polynomial : 0); } # elif CRC_BE_BITS == 2 while (len--) { crc ^= *p++ << 24; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; } # elif CRC_BE_BITS == 4 while (len--) { crc ^= *p++ << 24; crc = (crc << 4) ^ tab[0][crc >> 28]; crc = (crc << 4) ^ tab[0][crc >> 28]; } # elif CRC_BE_BITS == 8 while (len--) { crc ^= *p++ << 24; crc = (crc << 8) ^ tab[0][crc >> 24]; } # else crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } #if CRC_BE_BITS == 1 u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); } #else u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE); } #endif EXPORT_SYMBOL(crc32_be); |
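/*
 * Editor's note (illustration only, not part of lib/crc32.c): a minimal
 * userspace sketch of the CRC_LE_BITS == 1 bit-at-a-time path above, using
 * the standard little-endian Ethernet polynomial 0xEDB88320 (CRC32_POLY_LE).
 * As the file header notes, the generic crc32() takes the seed as an
 * argument and does not xor at the end; the conventional "CRC-32" value is
 * obtained by seeding with ~0 and inverting the result, as main() does.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_le_bitwise(uint32_t crc, const unsigned char *p,
				 size_t len)
{
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

int main(void)
{
	const char *s = "123456789";
	uint32_t crc = ~crc32_le_bitwise(~0u, (const unsigned char *)s,
					 strlen(s));

	/* Standard check value for CRC-32/ISO-HDLC of "123456789" */
	printf("%08x\n", crc);
	return crc == 0xcbf43926 ? 0 : 1;
}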
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 International Business Machines, Corp.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This abstraction represents an SCTP endpoint.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@austin.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Dajiang Zhang <dajiang.zhang@nokia.com>
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/random.h>	/* get_random_bytes() */
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal helpers. */
static void sctp_endpoint_bh_rcv(struct work_struct *work);

/*
 * Initialize the base fields of the endpoint structure.
 */
static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
						struct sock *sk, gfp_t gfp)
{
	struct net *net = sock_net(sk);
	struct sctp_shared_key *null_key;

	ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
	if (!ep->digest)
		return NULL;

	ep->asconf_enable = net->sctp.addip_enable;
	ep->auth_enable = net->sctp.auth_enable;
	if (ep->auth_enable) {
		if (sctp_auth_init(ep, gfp))
			goto nomem;
		if (ep->asconf_enable) {
			sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
			sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
		}
	}

	/* Initialize the base structure. */
	/* What type of endpoint are we? */
	ep->base.type = SCTP_EP_TYPE_SOCKET;

	/* Initialize the basic object fields.
*/ refcount_set(&ep->base.refcnt, 1); ep->base.dead = false; /* Create an input queue. */ sctp_inq_init(&ep->base.inqueue); /* Set its top-half handler */ sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv); /* Initialize the bind addr area */ sctp_bind_addr_init(&ep->base.bind_addr, 0); /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = net->sctp.sndbuf_policy; sk->sk_data_ready = sctp_data_ready; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; /* Initialize the secret key used with cookie. */ get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); null_key = sctp_auth_shkey_create(0, gfp); if (!null_key) goto nomem_shkey; list_add(&null_key->key_list, &ep->endpoint_shared_keys); /* Add the null key to the endpoint shared keys list and * set the hmcas and chunks pointers. */ ep->prsctp_enable = net->sctp.prsctp_enable; ep->reconf_enable = net->sctp.reconf_enable; ep->ecn_enable = net->sctp.ecn_enable; /* Remember who we are attached to. */ ep->base.sk = sk; ep->base.net = sock_net(sk); sock_hold(ep->base.sk); return ep; nomem_shkey: sctp_auth_free(ep); nomem: kfree(ep->digest); return NULL; } /* Create a sctp_endpoint with all that boring stuff initialized. * Returns NULL if there isn't enough memory. */ struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) { struct sctp_endpoint *ep; /* Build a local endpoint. */ ep = kzalloc(sizeof(*ep), gfp); if (!ep) goto fail; if (!sctp_endpoint_init(ep, sk, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(ep); return ep; fail_init: kfree(ep); fail: return NULL; } /* Add an association to an endpoint. */ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, struct sctp_association *asoc) { struct sock *sk = ep->base.sk; /* If this is a temporary association, don't bother * since we'll be removing it shortly and don't * want anyone to find it anyway. */ if (asoc->temp) return; /* Now just add it to our list of asocs */ list_add_tail(&asoc->asocs, &ep->asocs); /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until * all users have released their reference count on this structure. */ void sctp_endpoint_free(struct sctp_endpoint *ep) { ep->base.dead = true; inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED); /* Unlink this endpoint, so we can't find it again! */ sctp_unhash_endpoint(ep); sctp_endpoint_put(ep); } /* Final destructor for endpoint. */ static void sctp_endpoint_destroy_rcu(struct rcu_head *head) { struct sctp_endpoint *ep = container_of(head, struct sctp_endpoint, rcu); struct sock *sk = ep->base.sk; sctp_sk(sk)->ep = NULL; sock_put(sk); kfree(ep); SCTP_DBG_OBJCNT_DEC(ep); } static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { struct sock *sk; if (unlikely(!ep->base.dead)) { WARN(1, "Attempt to destroy undead endpoint %p!\n", ep); return; } /* Free the digest buffer */ kfree(ep->digest); /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ sctp_auth_destroy_keys(&ep->endpoint_shared_keys); sctp_auth_free(ep); /* Cleanup. 
*/ sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); memset(ep->secret_key, 0, sizeof(ep->secret_key)); sk = ep->base.sk; /* Remove and free the port */ if (sctp_sk(sk)->bind_hash) sctp_put_port(sk); call_rcu(&ep->rcu, sctp_endpoint_destroy_rcu); } /* Hold a reference to an endpoint. */ int sctp_endpoint_hold(struct sctp_endpoint *ep) { return refcount_inc_not_zero(&ep->base.refcnt); } /* Release a reference to an endpoint and clean up if there are * no more references. */ void sctp_endpoint_put(struct sctp_endpoint *ep) { if (refcount_dec_and_test(&ep->base.refcnt)) sctp_endpoint_destroy(ep); } /* Is this the endpoint we are looking for? */ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, const union sctp_addr *laddr, int dif, int sdif) { int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_endpoint *retval = NULL; if (net_eq(ep->base.net, net) && sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && (htons(ep->base.bind_addr.port) == laddr->v4.sin_port)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; } return retval; } /* Find the association that goes with this chunk. * We lookup the transport from hashtable at first, then get association * through t->assoc. */ struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **transport) { struct sctp_association *asoc = NULL; struct sctp_transport *t; *transport = NULL; /* If the local port is not set, there can't be any associations * on this endpoint. */ if (!ep->base.bind_addr.port) return NULL; rcu_read_lock(); t = sctp_epaddr_lookup_transport(ep, paddr); if (!t) goto out; *transport = t; asoc = t->asoc; out: rcu_read_unlock(); return asoc; } /* Look for any peeled off association from the endpoint that matches the * given peer address. */ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_sockaddr_entry *addr; struct net *net = ep->base.net; struct sctp_bind_addr *bp; bp = &ep->base.bind_addr; /* This function is called with the socket lock held, * so the address_list can not change. */ list_for_each_entry(addr, &bp->address_list, list) { if (sctp_has_association(net, &addr->a, paddr, bound_dev_if, bound_dev_if)) return true; } return false; } /* Do delayed input processing. This is scheduled by sctp_rcv(). * This may be called on BH or task time. 
*/ static void sctp_endpoint_bh_rcv(struct work_struct *work) { struct sctp_endpoint *ep = container_of(work, struct sctp_endpoint, base.inqueue.immediate); struct sctp_association *asoc; struct sock *sk; struct net *net; struct sctp_transport *transport; struct sctp_chunk *chunk; struct sctp_inq *inqueue; union sctp_subtype subtype; enum sctp_state state; int error = 0; int first_time = 1; /* is this the first time through the loop */ if (ep->base.dead) return; asoc = NULL; inqueue = &ep->base.inqueue; sk = ep->base.sk; net = sock_net(sk); while (NULL != (chunk = sctp_inq_pop(inqueue))) { subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); /* If the first chunk in the packet is AUTH, do special * processing specified in Section 6.3 of SCTP-AUTH spec */ if (first_time && (subtype.chunk == SCTP_CID_AUTH)) { struct sctp_chunkhdr *next_hdr; next_hdr = sctp_inq_peek(inqueue); if (!next_hdr) goto normal; /* If the next chunk is COOKIE-ECHO, skip the AUTH * chunk while saving a pointer to it so we can do * Authentication later (during cookie-echo * processing). */ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) { chunk->auth_chunk = skb_clone(chunk->skb, GFP_ATOMIC); chunk->auth = 1; continue; } } normal: /* We might have grown an association since last we * looked, so try again. * * This happens when we've just processed our * COOKIE-ECHO chunk. */ if (NULL == chunk->asoc) { asoc = sctp_endpoint_lookup_assoc(ep, sctp_source(chunk), &transport); chunk->asoc = asoc; chunk->transport = transport; } state = asoc ? asoc->state : SCTP_STATE_CLOSED; if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) continue; /* Remember where the last DATA chunk came from so we * know where to send the SACK. */ if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } if (chunk->transport) chunk->transport->last_time_heard = ktime_get(); error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state, ep, asoc, chunk, GFP_ATOMIC); if (error && chunk) chunk->pdiscard = 1; /* Check to see if the endpoint is freed in response to * the incoming chunk. If so, get out of the while loop. */ if (!sctp_sk(sk)->ep) break; if (first_time) first_time = 0; } } |
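/*
 * Editor's note (illustration only, not part of net/sctp): the endpoint code
 * above pairs refcount_inc_not_zero() in sctp_endpoint_hold() with
 * refcount_dec_and_test() in sctp_endpoint_put(), so a lookup path can only
 * take a reference on an object that has not already started dying.  A
 * minimal userspace sketch of those two primitives with C11 atomics and a
 * hypothetical struct obj:
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;
};

/* Like refcount_inc_not_zero(): succeed only while the count is non-zero. */
static bool obj_hold(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;
	}
	return false;
}

/* Like refcount_dec_and_test(): whoever drops the last reference destroys. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		printf("last reference dropped, destroy object\n");
}

int main(void)
{
	struct obj o = { .refcnt = 1 };

	if (obj_hold(&o))		/* 1 -> 2 */
		obj_put(&o);		/* 2 -> 1 */
	obj_put(&o);			/* 1 -> 0, destroy */
	return obj_hold(&o) ? 1 : 0;	/* now fails: object is dead */
}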
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 * TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template_lib.c
 *      Library of supported template fields.
*/ #include "ima_template_lib.h" #include <linux/xattr.h> #include <linux/evm.h> static bool ima_template_hash_algo_allowed(u8 algo) { if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) return true; return false; } enum data_formats { DATA_FMT_DIGEST = 0, DATA_FMT_DIGEST_WITH_ALGO, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, DATA_FMT_STRING, DATA_FMT_HEX, DATA_FMT_UINT }; enum digest_type { DIGEST_TYPE_IMA, DIGEST_TYPE_VERITY, DIGEST_TYPE__LAST }; #define DIGEST_TYPE_NAME_LEN_MAX 7 /* including NUL */ static const char * const digest_type_name[DIGEST_TYPE__LAST] = { [DIGEST_TYPE_IMA] = "ima", [DIGEST_TYPE_VERITY] = "verity" }; static int ima_write_template_field_data(const void *data, const u32 datalen, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf, *buf_ptr; u32 buflen = datalen; if (datafmt == DATA_FMT_STRING) buflen = datalen + 1; buf = kzalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, data, datalen); /* * Replace all space characters with underscore for event names and * strings. This avoid that, during the parsing of a measurements list, * filenames with spaces or that end with the suffix ' (deleted)' are * split into multiple template fields (the space is the delimitator * character for measurements lists in ASCII format). */ if (datafmt == DATA_FMT_STRING) { for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) if (*buf_ptr == ' ') *buf_ptr = '_'; } field_data->data = buf; field_data->len = buflen; return 0; } static void ima_show_template_data_ascii(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf_ptr = field_data->data; u32 buflen = field_data->len; switch (datafmt) { case DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: case DATA_FMT_DIGEST_WITH_ALGO: buf_ptr = strrchr(field_data->data, ':'); if (buf_ptr != field_data->data) seq_printf(m, "%s", field_data->data); /* skip ':' and '\0' */ buf_ptr += 2; buflen -= buf_ptr - field_data->data; fallthrough; case DATA_FMT_DIGEST: case DATA_FMT_HEX: if (!buflen) break; ima_print_digest(m, buf_ptr, buflen); break; case DATA_FMT_STRING: seq_printf(m, "%s", buf_ptr); break; case DATA_FMT_UINT: switch (field_data->len) { case sizeof(u8): seq_printf(m, "%u", *(u8 *)buf_ptr); break; case sizeof(u16): if (ima_canonical_fmt) seq_printf(m, "%u", le16_to_cpu(*(__le16 *)buf_ptr)); else seq_printf(m, "%u", *(u16 *)buf_ptr); break; case sizeof(u32): if (ima_canonical_fmt) seq_printf(m, "%u", le32_to_cpu(*(__le32 *)buf_ptr)); else seq_printf(m, "%u", *(u32 *)buf_ptr); break; case sizeof(u64): if (ima_canonical_fmt) seq_printf(m, "%llu", le64_to_cpu(*(__le64 *)buf_ptr)); else seq_printf(m, "%llu", *(u64 *)buf_ptr); break; default: break; } break; default: break; } } static void ima_show_template_data_binary(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { u32 field_len = !ima_canonical_fmt ? 
len : (__force u32)cpu_to_le32(len); ima_putc(m, &field_len, sizeof(field_len)); } if (!len) return; ima_putc(m, field_data->data, len); } static void ima_show_template_field_data(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { switch (show) { case IMA_SHOW_ASCII: ima_show_template_data_ascii(m, show, datafmt, field_data); break; case IMA_SHOW_BINARY: case IMA_SHOW_BINARY_NO_FIELD_LEN: case IMA_SHOW_BINARY_OLD_STRING_FMT: ima_show_template_data_binary(m, show, datafmt, field_data); break; default: break; } } void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); } void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, field_data); } void ima_show_template_digest_ngv2(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, field_data); } void ima_show_template_string(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); } void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_buf(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_uint(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_UINT, field_data); } /** * ima_parse_buf() - Parses lengths and data from an input buffer * @bufstartp: Buffer start address. * @bufendp: Buffer end address. * @bufcurp: Pointer to remaining (non-parsed) data. * @maxfields: Length of fields array. * @fields: Array containing lengths and pointers of parsed data. * @curfields: Number of array items containing parsed data. * @len_mask: Bitmap (if bit is set, data length should not be parsed). * @enforce_mask: Check if curfields == maxfields and/or bufcurp == bufendp. * @bufname: String identifier of the input buffer. * * Return: 0 on success, -EINVAL on error. 
*/ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, int maxfields, struct ima_field_data *fields, int *curfields, unsigned long *len_mask, int enforce_mask, char *bufname) { void *bufp = bufstartp; int i; for (i = 0; i < maxfields; i++) { if (len_mask == NULL || !test_bit(i, len_mask)) { if (bufp > (bufendp - sizeof(u32))) break; if (ima_canonical_fmt) fields[i].len = le32_to_cpu(*(__le32 *)bufp); else fields[i].len = *(u32 *)bufp; bufp += sizeof(u32); } if (bufp > (bufendp - fields[i].len)) break; fields[i].data = bufp; bufp += fields[i].len; } if ((enforce_mask & ENFORCE_FIELDS) && i != maxfields) { pr_err("%s: nr of fields mismatch: expected: %d, current: %d\n", bufname, maxfields, i); return -EINVAL; } if ((enforce_mask & ENFORCE_BUFEND) && bufp != bufendp) { pr_err("%s: buf end mismatch: expected: %p, current: %p\n", bufname, bufendp, bufp); return -EINVAL; } if (curfields) *curfields = i; if (bufcurp) *bufcurp = bufp; return 0; } static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, u8 digest_type, u8 hash_algo, struct ima_field_data *field_data) { /* * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: <hash algo> + ':' + '\0' + digest, * - DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: * <digest type> + ':' + <hash algo> + ':' + '\0' + digest, * * where 'DATA_FMT_DIGEST' is the original digest format ('d') * with a hash size limitation of 20 bytes, * where <digest type> is either "ima" or "verity", * where <hash algo> is the hash_algo_name[] string. */ u8 buffer[DIGEST_TYPE_NAME_LEN_MAX + CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; enum data_formats fmt = DATA_FMT_DIGEST; u32 offset = 0; if (digest_type < DIGEST_TYPE__LAST && hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO; offset += 1 + sprintf(buffer, "%s:%s:", digest_type_name[digest_type], hash_algo_name[hash_algo]); } else if (hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_ALGO; offset += 1 + sprintf(buffer, "%s:", hash_algo_name[hash_algo]); } if (digest) memcpy(buffer + offset, digest, digestsize); else /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the * hash algorithm digest size. */ offset += hash_digest_size[hash_algo]; return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); } /* * This function writes the digest of an event (with size limit). */ int ima_eventdigest_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct ima_max_digest_data hash; u8 *cur_digest = NULL; u32 cur_digestsize = 0; struct inode *inode; int result; memset(&hash, 0, sizeof(hash)); if (event_data->violation) /* recording a violation. 
*/ goto out; if (ima_template_hash_algo_allowed(event_data->iint->ima_hash->algo)) { cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; goto out; } if ((const char *)event_data->filename == boot_aggregate_name) { if (ima_tpm_chip) { hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_boot_aggregate(&hash.hdr); /* algo can change depending on available PCR banks */ if (!result && hash.hdr.algo != HASH_ALGO_SHA1) result = -EINVAL; if (result < 0) memset(&hash, 0, sizeof(hash)); } cur_digest = hash.hdr.digest; cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; goto out; } if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; inode = file_inode(event_data->file); hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? ima_hash_algo : HASH_ALGO_SHA1; result = ima_calc_file_hash(event_data->file, &hash.hdr); if (result) { integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, event_data->filename, "collect_data", "failed", result, 0); return result; } cur_digest = hash.hdr.digest; cur_digestsize = hash.hdr.length; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, HASH_ALGO__LAST, field_data); } /* * This function writes the digest of an event (without size limit). */ int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } /* * This function writes the digest of an event (without size limit), * prefixed with both the digest type and hash algorithm. */ int ima_eventdigest_ngv2_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; u8 digest_type = DIGEST_TYPE_IMA; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; if (event_data->iint->flags & IMA_VERITY_REQUIRED) digest_type = DIGEST_TYPE_VERITY; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, digest_type, hash_algo, field_data); } /* * This function writes the digest of the file which is expected to match the * digest contained in the file's appended signature. */ int ima_eventdigest_modsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { enum hash_algo hash_algo; const u8 *cur_digest; u32 cur_digestsize; if (!event_data->modsig) return 0; if (event_data->violation) { /* Recording a violation. */ hash_algo = HASH_ALGO_SHA1; cur_digest = NULL; cur_digestsize = 0; } else { int rc; rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, &cur_digest, &cur_digestsize); if (rc) return rc; else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) /* There was some error collecting the digest. 
*/ return -EINVAL; } return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) { const char *cur_filename = NULL; u32 cur_filename_len = 0; BUG_ON(event_data->filename == NULL && event_data->file == NULL); if (event_data->filename) { cur_filename = event_data->filename; cur_filename_len = strlen(event_data->filename); if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) goto out; } if (event_data->file) { cur_filename = event_data->file->f_path.dentry->d_name.name; cur_filename_len = strlen(cur_filename); } else /* * Truncate filename if the latter is too long and * the file descriptor is not available. */ cur_filename_len = IMA_EVENT_NAME_LEN_MAX; out: return ima_write_template_field_data(cur_filename, cur_filename_len, DATA_FMT_STRING, field_data); } /* * This function writes the name of an event (with size limit). */ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, true); } /* * This function writes the name of an event (without size limit). */ int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, false); } /* * ima_eventsig_init - include the file signature as part of the template data */ int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_value = event_data->xattr_value; if (!xattr_value || (xattr_value->type != EVM_IMA_XATTR_DIGSIG && xattr_value->type != IMA_VERITY_DIGSIG)) return ima_eventevmsig_init(event_data, field_data); return ima_write_template_field_data(xattr_value, event_data->xattr_len, DATA_FMT_HEX, field_data); } /* * ima_eventbuf_init - include the buffer (kexec-cmdline) as part of the * template data. */ int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { if ((!event_data->buf) || (event_data->buf_len == 0)) return 0; return ima_write_template_field_data(event_data->buf, event_data->buf_len, DATA_FMT_HEX, field_data); } /* * ima_eventmodsig_init - include the appended file signature as part of the * template data */ int ima_eventmodsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { const void *data; u32 data_len; int rc; if (!event_data->modsig) return 0; /* * modsig is a runtime structure containing pointers. Get its raw data * instead.
*/ rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); if (rc) return rc; return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, field_data); } /* * ima_eventevmsig_init - include the EVM portable signature as part of the * template data */ int ima_eventevmsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_data = NULL; int rc = 0; if (!event_data->file) return 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)xattr_data, rc, DATA_FMT_HEX, field_data); out: kfree(xattr_data); return rc; } static int ima_eventinodedac_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool get_uid) { unsigned int id; if (!event_data->file) return 0; if (get_uid) id = i_uid_read(file_inode(event_data->file)); else id = i_gid_read(file_inode(event_data->file)); if (ima_canonical_fmt) { if (sizeof(id) == sizeof(u16)) id = (__force u16)cpu_to_le16(id); else id = (__force u32)cpu_to_le32(id); } return ima_write_template_field_data((void *)&id, sizeof(id), DATA_FMT_UINT, field_data); } /* * ima_eventinodeuid_init - include the inode UID as part of the template * data */ int ima_eventinodeuid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, true); } /* * ima_eventinodegid_init - include the inode GID as part of the template * data */ int ima_eventinodegid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, false); } /* * ima_eventinodemode_init - include the inode mode as part of the template * data */ int ima_eventinodemode_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct inode *inode; u16 mode; if (!event_data->file) return 0; inode = file_inode(event_data->file); mode = inode->i_mode; if (ima_canonical_fmt) mode = (__force u16)cpu_to_le16(mode); return ima_write_template_field_data((char *)&mode, sizeof(mode), DATA_FMT_UINT, field_data); } static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, char type) { u8 *buffer = NULL; int rc; if (!event_data->file) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0, type, ima_canonical_fmt); if (rc < 0) return 0; buffer = kmalloc(rc, GFP_KERNEL); if (!buffer) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer, rc, type, ima_canonical_fmt); if (rc < 0) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX, field_data); out: kfree(buffer); return rc; } /* * ima_eventinodexattrnames_init - include a list of xattr names as part of the * template data */ int ima_eventinodexattrnames_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'n'); } /* * ima_eventinodexattrlengths_init - include a list of xattr lengths as part of * the template data */ int ima_eventinodexattrlengths_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'l'); } /* * ima_eventinodexattrvalues_init - include a list of xattr values as part of * the template data 
*/ int ima_eventinodexattrvalues_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'v'); }
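/*
 * Illustrative sketch only (hypothetical field, not part of the code above):
 * a new template field initializer follows the same pattern as the
 * ima_event*_init() helpers shown here -- validate the inputs, honour
 * ima_canonical_fmt for fixed-width integers, and hand the result to
 * ima_write_template_field_data() with a suitable format.
 */
static int ima_eventinodesize_init(struct ima_event_data *event_data,
				   struct ima_field_data *field_data)
{
	u64 size;

	/* Nothing to record without a file, mirroring the DAC helpers. */
	if (!event_data->file)
		return 0;

	size = i_size_read(file_inode(event_data->file));
	if (ima_canonical_fmt)
		size = (__force u64)cpu_to_le64(size);

	return ima_write_template_field_data((void *)&size, sizeof(size),
					     DATA_FMT_UINT, field_data);
}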
// SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * * Ext4-specific journaling extensions. */ #ifndef _EXT4_JBD2_H #define _EXT4_JBD2_H #include <linux/fs.h> #include <linux/jbd2.h> #include "ext4.h" #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) /* Define the number of blocks we need to account to a transaction to * modify one block of data. * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data * block to complete the transaction. * * For an extents-enabled fs we may have to allocate and modify up to * 5 levels of tree plus the data block (for each of these we need bitmap + group * summaries), the root, which is stored in the inode, and the sb. */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ #define EXT4_XATTR_TRANS_BLOCKS 6U /* Define the minimum size for a transaction which modifies data. This * needs to take into account the fact that we may end up modifying two * quota files too (one for the group, one for the user quota). The * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates.
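 *
 * Worked example (illustrative, not from this header): with extents enabled
 * EXT4_SINGLEDATA_TRANS_BLOCKS is 20 and EXT4_XATTR_TRANS_BLOCKS is 6, so
 * assuming EXT4_MAXQUOTAS is 3 and quota is enabled (1 credit per quota
 * type), EXT4_DATA_TRANS_BLOCKS works out to 20 + 6 - 2 + 3 = 27 credits.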
*/ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. * * This includes the super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define EXT4_MAX_TRANS_DATA 64U /* We break up a large truncate or write transaction once the handle's * buffer credits get this low; we then need either to extend the * transaction or to start a new one. Reserve enough space here for * inode, bitmap, superblock, group and indirection updates for at least * one block, plus two quota updates. Quota allocations are not * needed. */ #define EXT4_RESERVE_TRANS_BLOCKS 12U /* * Number of credits needed if we need to insert an entry into a * directory. For each new index block, we need 4 blocks (old index * block, new index block, bitmap block, bg summary). For normal * htree directories there are 2 levels; if the largedir feature is * enabled it's 3 levels. */ #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) /* * Ext4 handle operation types -- for logging purposes */ #define EXT4_HT_MISC 0 #define EXT4_HT_INODE 1 #define EXT4_HT_WRITE_PAGE 2 #define EXT4_HT_MAP_BLOCKS 3 #define EXT4_HT_DIR 4 #define EXT4_HT_TRUNCATE 5 #define EXT4_HT_QUOTA 6 #define EXT4_HT_RESIZE 7 #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. * * This struct is a 'seed' structure for use with your own callback * structs. If you are using callbacks you must allocate one of these * or another struct of your own definition which has this struct * as its first element and pass it to ext4_journal_callback_add().
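 *
 * Illustrative embedding (hypothetical caller-side struct, not from ext4):
 *
 *	struct my_commit_callback {
 *		struct ext4_journal_cb_entry jce;	(must stay the first member)
 *		struct inode *inode;			(caller's private data)
 *	};
 *
 * so that container_of() on the jce pointer passed to the callback recovers
 * the caller's structure inside jce_func().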
*/ struct ext4_journal_cb_entry { /* list information for other callbacks attached to the same handle */ struct list_head jce_list; /* Function to call with this callback structure */ void (*jce_func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error); /* user data goes here */ }; /** * ext4_journal_callback_add: add a function to call after transaction commit * @handle: active journal transaction handle to register callback on * @func: callback function to call after the transaction has committed; * it is passed the superblock of the filesystem for the transaction (@sb), * this callback entry (@jce) and the journal state at commit * (@rc, 0 = transaction committed properly) * @jce: journal callback data (internal and function private data struct) * * The registered function will be called in the context of the journal thread * after the transaction for which the handle was created has completed. * * No locks are held when the callback function is called, so it is safe to * call blocking functions from within the callback, but the callback should * not block or run for too long, or the filesystem will be blocked waiting for * the next transaction to commit. No journaling functions can be used, or * there is a risk of deadlock. * * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. */ static inline void _ext4_journal_callback_add(handle_t *handle, struct ext4_journal_cb_entry *jce) { /* Add the jce to transaction's private list */ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); } static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc), struct ext4_journal_cb_entry *jce) { struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } /** * ext4_journal_callback_try_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister * Return true if the object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); return deleted; } int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); /* * On success, we end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); #define ext4_mark_inode_dirty(__h, __i) \ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD.
*/ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) #define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) /* Note: Do not use this for NULL handles. This is only to determine if * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } static inline void ext4_handle_sync(handle_t *handle) { if (ext4_handle_valid(handle)) handle->h_sync = 1; } static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) return is_handle_aborted(handle); return 0; } static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, int blocks) { /* Freeing each metadata block can result in freeing one cluster */ return blocks * EXT4_SB(sb)->s_cluster_ratio; } static inline int ext4_trans_default_revoke_credits(struct super_block *sb) { return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) #define ext4_journal_start_reserved(handle, type) \ __ext4_journal_start_reserved((handle), __LINE__, (type)) handle_t 
*__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); } static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2_journal_extend(handle, nblocks, revoke); return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred); /* * Ensure @handle has at least @check_creds credits available. If not, * transaction will be extended or restarted to contain at least @extend_cred * credits. Before restarting transaction @fn is executed to allow for cleanup * before the transaction is restarted. * * The return value is < 0 in case of error, 0 in case the handle has enough * credits or transaction extension succeeded, 1 in case transaction had to be * restarted. */ #define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ revoke_cred, fn) \ ({ \ __label__ __ensure_end; \ int err = __ext4_journal_ensure_credits((handle), (check_cred), \ (extend_cred), (revoke_cred)); \ \ if (err <= 0) \ goto __ensure_end; \ err = (fn); \ if (err < 0) \ goto __ensure_end; \ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ if (err == 0) \ err = 1; \ __ensure_end: \ err; \ }) /* * Ensure given handle has at least requested amount of credits available, * possibly restarting transaction if needed. We also make sure the transaction * has space for at least ext4_trans_default_revoke_credits(sb) revoke records * as freeing one or two blocks is very common pattern and requesting this is * very cheap. 
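 *
 * Illustrative call pattern (sketch, not from this header): the return value
 * follows ext4_journal_ensure_credits_fn(), i.e. < 0 on error, 0 if the
 * handle has (or could be extended to have) enough credits, 1 if the
 * transaction had to be restarted:
 *
 *	err = ext4_journal_ensure_credits(handle, credits, revoke_creds);
 *	if (err < 0)
 *		goto out;
 *	if (err > 0)
 *		... revalidate any state tied to the previous transaction ...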
*/ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, int revoke_creds) { return ext4_journal_ensure_credits_fn(handle, credits, credits, revoke_creds, 0); } static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) return jbd2_journal_blocks_per_page(inode); return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { if (journal) return jbd2_journal_force_commit(journal); return 0; } static inline int ext4_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; } } /* super.c */ int ext4_force_commit(struct super_block *sb); /* * Ext4 inode journal modes */ #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) { if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 0; if (!ext4_should_journal_data(inode)) return 0; /* * Data blocks in one extent are contiguous, just account for partial * clusters at extent boundaries */ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); } /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ if (!test_opt(inode->i_sb, DELALLOC)) return 0; return 1; } #endif /* _EXT4_JBD2_H */ |
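/*
 * Illustrative usage sketch (not part of ext4_jbd2.h): the typical sequence a
 * caller follows with the wrappers above -- start a handle, declare write
 * access to a buffer, dirty it, and stop the handle. The function name is
 * hypothetical, error handling is abbreviated, the buffer_head is assumed to
 * have been read in by the caller, and EXT4_JTR_NONE comes from ext4.h.
 */
static int example_modify_metadata_block(struct inode *inode,
					 struct buffer_head *bh)
{
	handle_t *handle;
	int err, err2;

	/* Reserve credits for a single metadata block modification. */
	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Tell jbd2 we intend to modify this buffer under the handle. */
	err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
					    EXT4_JTR_NONE);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = ext4_handle_dirty_metadata(handle, inode, bh);
	}

	/* Always stop the handle so its credits go back to the journal. */
	err2 = ext4_journal_stop(handle);
	return err ? err : err2;
}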
// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/transaction.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * * Copyright 1998 Red Hat corp --- All Rights Reserved * * Generic filesystem transaction handling code; part of the ext2fs * journaling system. * * This file manages transactions (compound commits managed by the * journaling code) and handles (individual atomic operations by the * filesystem).
*/ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/backing-dev.h> #include <linux/bug.h> #include <linux/module.h> #include <linux/sched/mm.h> #include <trace/events/jbd2.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); static struct kmem_cache *transaction_cache; int __init jbd2_journal_init_transaction_cache(void) { J_ASSERT(!transaction_cache); transaction_cache = kmem_cache_create("jbd2_transaction_s", sizeof(transaction_t), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!transaction_cache) { pr_emerg("JBD2: failed to create transaction cache\n"); return -ENOMEM; } return 0; } void jbd2_journal_destroy_transaction_cache(void) { kmem_cache_destroy(transaction_cache); transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) { if (unlikely(ZERO_OR_NULL_PTR(transaction))) return; kmem_cache_free(transaction_cache, transaction); } /* * Base amount of descriptor blocks we reserve for each transaction. */ static int jbd2_descriptor_blocks_per_trans(journal_t *journal) { int tag_space = journal->j_blocksize - sizeof(journal_header_t); int tags_per_block; /* Subtract UUID */ tag_space -= 16; if (jbd2_journal_has_csum_v2or3(journal)) tag_space -= sizeof(struct jbd2_journal_block_tail); /* Commit code leaves a slack space of 16 bytes at the end of block */ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); /* * Revoke descriptors are accounted separately so we need to reserve * space for commit block and normal transaction descriptor blocks. */ return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, tags_per_block); } /* * jbd2_get_transaction: obtain a new transaction_t object. * * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). * * Preconditions: * The journal MUST be locked. We don't perform atomic mallocs on the * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * */ static void jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, jbd2_descriptor_blocks_per_trans(journal) + atomic_read(&journal->j_reserved_credits)); atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; } /* * Handle management. 
* * A handle_t is an object which represents a single atomic update to a * filesystem, and which tracks all of the modifications which form part * of that one update. */ /* * Update transaction's maximum wait time, if debugging is enabled. * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiple threads trying to do this simultaneously, * hence using cmpxchg to avoid any use of locks in this case. * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. */ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) { unsigned long oldts, newts; if (time_after(transaction->t_start, ts)) { newts = jbd2_time_diff(ts, transaction->t_start); oldts = READ_ONCE(transaction->t_max_wait); while (oldts < newts) oldts = cmpxchg(&transaction->t_max_wait, oldts, newts); } } /* * Wait until running transaction passes to T_FLUSH state and new transaction * can thus be started. Also starts the commit if needed. The function expects * running transaction to exist and releases j_state_lock. */ static void wait_transaction_locked(journal_t *journal) __releases(journal->j_state_lock) { DEFINE_WAIT(wait); int need_to_start; tid_t tid = journal->j_running_transaction->t_tid; prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } /* * Wait until running transaction transitions from T_SWITCH to T_FLUSH * state and new transaction can thus be started. The function releases * j_state_lock. */ static void wait_transaction_switching(journal_t *journal) __releases(journal->j_state_lock) { DEFINE_WAIT(wait); if (WARN_ON(!journal->j_running_transaction || journal->j_running_transaction->t_state != T_SWITCH)) { read_unlock(&journal->j_state_lock); return; } prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); /* * We don't call jbd2_might_wait_for_commit() here as there's no * waiting for outstanding handles happening anymore in T_SWITCH state * and handling of reserved handles actually relies on that for * correctness. */ schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } static void sub_reserved_credits(journal_t *journal, int blocks) { atomic_sub(blocks, &journal->j_reserved_credits); wake_up(&journal->j_wait_reserved); } /* * Wait until we can add credits for handle to the running transaction. Called * with j_state_lock held for reading. Returns 0 if handle joined the running * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and * caller must retry. * * Note: because j_state_lock may be dropped depending on the return * value, we need to fake out sparse so it doesn't complain about a * locking imbalance. Callers of add_transaction_credits will need to * make a similar accommodation. */ static int add_transaction_credits(journal_t *journal, int blocks, int rsv_blocks) __must_hold(&journal->j_state_lock) { transaction_t *t = journal->j_running_transaction; int needed; int total = blocks + rsv_blocks; /* * If the current transaction is locked down for commit, wait * for the lock to be released.
*/ if (t->t_state != T_RUNNING) { WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* * If there is not enough space left in the log to write all * potential buffers requested by this operation, we need to * stall pending a log checkpoint to free some more log space. */ needed = atomic_add_return(total, &t->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* * If the current transaction is already too large, * then start to commit it: we can then go back and * attach this handle to a new transaction. */ atomic_sub(total, &t->t_outstanding_credits); /* * Is the number of reserved credits in the current transaction too * big to fit this handle? Wait until reserved credits are freed. */ if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } wait_transaction_locked(journal); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* * The commit code assumes that it can get enough log space * without forcing a checkpoint. This is *critical* for * correctness: a checkpoint of a buffer which is also * associated with a committing transaction creates a deadlock, * so commit simply cannot force through checkpoints. * * We must therefore ensure the necessary space in the journal * *before* starting to dirty potentially checkpointed buffers * in the new transaction. */ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } /* No reservation? We are done... */ if (!rsv_blocks) return 0; needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); /* We allow at most half of a transaction to be reserved */ if (needed > journal->j_max_transaction_buffers / 2) { sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } return 0; } /* * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the * transaction's buffer credits. */ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; int blocks = handle->h_total_credits; int rsv_blocks = 0; unsigned long ts = jiffies; if (handle->h_rsv_handle) rsv_blocks = handle->h_rsv_handle->h_total_credits; /* * Limit the number of reserved credits to 1/2 of maximum transaction * size and limit the number of total credits to not exceed maximum * transaction size per operation. 
*/ if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { printk(KERN_ERR "JBD2: %s wants too many credits " "credits:%d rsv_credits:%d max:%d\n", current->comm, blocks, rsv_blocks, journal->j_max_transaction_buffers); WARN_ON(1); return -ENOSPC; } alloc_transaction: /* * This check is racy but it is just an optimization of allocating new * transaction early if there are high chances we'll need it. If we * guess wrong, we'll retry or free unused transaction. */ if (!data_race(journal->j_running_transaction)) { /* * If __GFP_FS is not present, then we may be being called from * inside the fs writeback layer, so we MUST NOT fail. */ if ((gfp_mask & __GFP_FS) == 0) gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); if (!new_transaction) return -ENOMEM; } jbd2_debug(3, "New handle %p going live.\n", handle); /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ repeat: read_lock(&journal->j_state_lock); BUG_ON(journal->j_flags & JBD2_UNMOUNT); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); jbd2_journal_free_transaction(new_transaction); return -EROFS; } /* * Wait on the journal's transaction barrier if necessary. Specifically * we allow reserved handles to proceed because otherwise commit could * deadlock on page writeback not being able to complete. */ if (!handle->h_reserved && journal->j_barrier_count) { read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { read_unlock(&journal->j_state_lock); if (!new_transaction) goto alloc_transaction; write_lock(&journal->j_state_lock); if (!journal->j_running_transaction && (handle->h_reserved || !journal->j_barrier_count)) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } write_unlock(&journal->j_state_lock); goto repeat; } transaction = journal->j_running_transaction; if (!handle->h_reserved) { /* We may have dropped j_state_lock - restart in that case */ if (add_transaction_credits(journal, blocks, rsv_blocks)) { /* * add_transaction_credits releases * j_state_lock on a non-zero return */ __release(&journal->j_state_lock); goto repeat; } } else { /* * We have handle reserved so we are allowed to join T_LOCKED * transaction and we don't have to check for transaction size * and journal space. But we still have to wait while running * transaction is being switched to a committing one as it * won't wait for any handles anymore. */ if (transaction->t_state == T_SWITCH) { wait_transaction_switching(journal); goto repeat; } sub_reserved_credits(journal, blocks); handle->h_reserved = 0; } /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. 
*/ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; handle->h_requested_credits = blocks; handle->h_revoke_credits_requested = handle->h_revoke_credits; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", handle, blocks, atomic_read(&transaction->t_outstanding_credits), jbd2_log_space_left(journal)); read_unlock(&journal->j_state_lock); current->journal_info = handle; rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); /* * Ensure that no allocations done while the transaction is open are * going to recurse back to the fs layer. */ handle->saved_alloc_context = memalloc_nofs_save(); return 0; } /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; handle->h_total_credits = nblocks; handle->h_ref = 1; return handle; } handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; if (!journal) return ERR_PTR(-EROFS); if (handle) { J_ASSERT(handle->h_transaction->t_journal == journal); handle->h_ref++; return handle; } nblocks += DIV_ROUND_UP(revoke_records, journal->j_revoke_records_per_block); handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); if (rsv_blocks) { handle_t *rsv_handle; rsv_handle = new_handle(rsv_blocks); if (!rsv_handle) { jbd2_free_handle(handle); return ERR_PTR(-ENOMEM); } rsv_handle->h_reserved = 1; rsv_handle->h_journal = journal; handle->h_rsv_handle = rsv_handle; } handle->h_revoke_credits = revoke_records; err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { if (handle->h_rsv_handle) jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return ERR_PTR(err); } handle->h_type = type; handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); return handle; } EXPORT_SYMBOL(jbd2__journal_start); /** * jbd2_journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to * be converted to a normal handle using jbd2_journal_start_reserved() before * it can be used. * * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. 
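 *
 * Illustrative call sequence (sketch, not taken from this file):
 *
 *	handle = jbd2_journal_start(journal, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	... get write access to buffers, modify and dirty them ...
 *	err = jbd2_journal_stop(handle);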
*/ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); sub_reserved_credits(journal, handle->h_total_credits); if (t) atomic_sub(handle->h_total_credits, &t->t_outstanding_credits); } void jbd2_journal_free_reserved(handle_t *handle) { journal_t *journal = handle->h_journal; /* Get j_state_lock to pin running transaction if it exists */ read_lock(&journal->j_state_lock); __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction); read_unlock(&journal->j_state_lock); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); /** * jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start * @type: for handle statistics * @line_no: for handle statistics * * Start handle that has been previously reserved with jbd2_journal_reserve(). * This attaches @handle to the running transaction (or creates one if there's * not transaction running). Unlike jbd2_journal_start() this function cannot * block on journal commit, checkpointing, or similar stuff. It can block on * memory allocation or frozen journal though. * * Return 0 on success, non-zero on error - handle is freed in that case. */ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no) { journal_t *journal = handle->h_journal; int ret = -EIO; if (WARN_ON(!handle->h_reserved)) { /* Someone passed in normal handle? Just stop it. */ jbd2_journal_stop(handle); return ret; } /* * Usefulness of mixing of reserved and unreserved handles is * questionable. So far nobody seems to need it so just error out. */ if (WARN_ON(current->journal_info)) { jbd2_journal_free_reserved(handle); return ret; } handle->h_journal = NULL; /* * GFP_NOFS is here because callers are likely from writeback or * similarly constrained call sites */ ret = start_this_handle(journal, handle, GFP_NOFS); if (ret < 0) { handle->h_journal = journal; jbd2_journal_free_reserved(handle); return ret; } handle->h_type = type; handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, handle->h_total_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); /** * jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. * @revoke_records: number of revoke records to try to extend by. * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modifications in advance, but can * extend its credit if it needs more. * * jbd2_journal_extend tries to give the running handle more buffer credits. * It does not guarantee that allocation - this is a best-effort only. * The calling process MUST be able to deal cleanly with a failure to * extend here. * * Return 0 on success, non-zero on failure. * * return code < 0 implies an error * return code > 0 implies normal transaction-full status. 
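 *
 * Typical caller pattern (illustrative sketch): try to extend in place and
 * fall back to a restart when the transaction is full:
 *
 *	err = jbd2_journal_extend(handle, nblocks, 0);
 *	if (err > 0)
 *		err = jbd2_journal_restart(handle, nblocks);
 *	if (err < 0)
 *		goto fail;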
*/ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) { transaction_t *transaction = handle->h_transaction; journal_t *journal; int result; int wanted; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; result = 1; read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (transaction->t_state != T_RUNNING) { jbd2_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } nblocks += DIV_ROUND_UP( handle->h_revoke_credits_requested + revoke_records, journal->j_revoke_records_per_block) - DIV_ROUND_UP( handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); wanted = atomic_add_return(nblocks, &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { jbd2_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); atomic_sub(nblocks, &transaction->t_outstanding_credits); goto error_out; } trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_total_credits, nblocks); handle->h_total_credits += nblocks; handle->h_requested_credits += nblocks; handle->h_revoke_credits += revoke_records; handle->h_revoke_credits_requested += revoke_records; result = 0; jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks); error_out: read_unlock(&journal->j_state_lock); return result; } static void stop_this_handle(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int revokes; J_ASSERT(journal_current_handle() == handle); J_ASSERT(atomic_read(&transaction->t_updates) > 0); current->journal_info = NULL; /* * Subtract necessary revoke descriptor blocks from handle credits. We * take care to account only for revoke descriptor blocks the * transaction will really need as large sequences of transactions with * small numbers of revokes are relatively common. */ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; if (revokes) { int t_revokes, revoke_descriptors; int rr_per_blk = journal->j_revoke_records_per_block; WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) > handle->h_total_credits); t_revokes = atomic_add_return(revokes, &transaction->t_outstanding_revokes); revoke_descriptors = DIV_ROUND_UP(t_revokes, rr_per_blk) - DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); handle->h_total_credits -= revoke_descriptors; } atomic_sub(handle->h_total_credits, &transaction->t_outstanding_credits); if (handle->h_rsv_handle) __jbd2_journal_unreserve_handle(handle->h_rsv_handle, transaction); if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); /* * Scope of the GFP_NOFS context is over here and so we can restore the * original alloc context. */ memalloc_nofs_restore(handle->saved_alloc_context); } /** * jbd2__journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested * @revoke_records: number of revoke record credits requested * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem * operation. * * If the jbd2_journal_extend() call above fails to grant new buffer credits * to a running handle, a call to jbd2_journal_restart will commit the * handle's transaction so far and reattach the handle to a new * transaction capable of guaranteeing the requested number of * credits. 
We preserve reserved handle if there's any attached to the * passed in handle. */ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal; tid_t tid; int need_to_start; int ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; journal = transaction->t_journal; tid = transaction->t_tid; /* * First unlink the handle from its current transaction, and start the * commit on that. */ jbd2_debug(2, "restarting handle %p\n", handle); stop_this_handle(handle); handle->h_transaction = NULL; /* * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can * get rid of pointless j_state_lock traffic like this. */ read_lock(&journal->j_state_lock); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); handle->h_total_credits = nblocks + DIV_ROUND_UP(revoke_records, journal->j_revoke_records_per_block); handle->h_revoke_credits = revoke_records; ret = start_this_handle(journal, handle, gfp_mask); trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev, ret ? 0 : handle->h_transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_total_credits); return ret; } EXPORT_SYMBOL(jbd2__journal_restart); int jbd2_journal_restart(handle_t *handle, int nblocks) { return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS); } EXPORT_SYMBOL(jbd2_journal_restart); /* * Waits for any outstanding t_updates to finish. * This is called with write j_state_lock held. */ void jbd2_journal_wait_updates(journal_t *journal) { DEFINE_WAIT(wait); while (1) { /* * Note that the running transaction can get freed under us if * this transaction is getting committed in * jbd2_journal_commit_transaction() -> * jbd2_journal_free_transaction(). This can only happen when we * release j_state_lock -> schedule() -> acquire j_state_lock. * Hence we should everytime retrieve new j_running_transaction * value (after j_state_lock release acquire cycle), else it may * lead to use-after-free of old freed transaction. */ transaction_t *transaction = journal->j_running_transaction; if (!transaction) break; prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { finish_wait(&journal->j_wait_updates, &wait); break; } write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); write_lock(&journal->j_state_lock); } } /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks * until all existing updates have completed, returning only once the * journal is in a quiescent state with no updates running. * * The journal lock should not be held on entry. 
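 *
 * For example, the usual barrier bracket looks like this (a sketch; what is
 * done while the journal is quiesced is entirely up to the caller):
 *
 *	jbd2_journal_lock_updates(journal);
 *	... no new handles can start and no updates are running here ...
 *	jbd2_journal_unlock_updates(journal);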
*/ void jbd2_journal_lock_updates(journal_t *journal) { jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no reserved handles */ if (atomic_read(&journal->j_reserved_credits)) { write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) == 0); write_lock(&journal->j_state_lock); } /* Wait until there are no running t_updates */ jbd2_journal_wait_updates(journal); write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but * we also need to barrier against other jbd2_journal_lock_updates() calls * to make sure that we serialise special journal-locked operations * too. */ mutex_lock(&journal->j_barrier); } /** * jbd2_journal_unlock_updates () - release barrier * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with jbd2_journal_lock_updates(). * * Should be called without the journal lock held. */ void jbd2_journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); write_lock(&journal->j_state_lock); --journal->j_barrier_count; write_unlock(&journal->j_state_lock); wake_up_all(&journal->j_wait_transaction_locked); } static void warn_dirty_buffer(struct buffer_head *bh) { printk(KERN_WARNING "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); } /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); memcpy(jh->b_frozen_data, source, bh->b_size); kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching * triggers. */ jh->b_frozen_triggers = jh->b_triggers; } /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior * transaction which we are still committing to disk, then we need to * make sure that we do not overwrite the old copy: we do copy-out to * preserve the copy going to disk. We also account the buffer against * the handle's metadata buffer credits (unless the buffer is already * part of the transaction, that is). * */ static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { struct buffer_head *bh; transaction_t *transaction = handle->h_transaction; journal_t *journal; int error; char *frozen_buffer = NULL; unsigned long start_lock, time_lock; journal = transaction->t_journal; jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: bh = jh2bh(jh); /* @@@ Need to check for errors here at some point. */ start_lock = jiffies; lock_buffer(bh); spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); if (time_lock > HZ/10) trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, jiffies_to_msecs(time_lock)); /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * * If so, there are two possibilities. 
The buffer may be * non-journaled, and undergoing a quite legitimate writeback. * Otherwise, it is journaled, and we don't expect dirty buffers * in that state (the buffers should be marked JBD_Dirty * instead.) So either the IO is being done under our own * control and this is a bug, or it's a third party IO such as * dump(8) (which may leave the buffer scheduled for read --- * ie. locked but not dirty) or tune2fs (which may actually have * the buffer dirtied, ugh.) */ if (buffer_dirty(bh) && jh->b_transaction) { warn_dirty_buffer(bh); /* * We need to clean the dirty flag and we must do it under the * buffer lock to be sure we don't race with running write-out. */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); clear_buffer_dirty(bh); /* * The buffer is going to be added to BJ_Reserved list now and * nothing guarantees jbd2_journal_dirty_metadata() will be * ever called for it. So we need to set jbddirty bit here to * make sure the buffer is dirtied and written out when the * journaling machinery is done with it. */ set_buffer_jbddirty(bh); } error = -EROFS; if (is_handle_aborted(handle)) { spin_unlock(&jh->b_state_lock); unlock_buffer(bh); goto out; } error = 0; /* * The buffer is already part of this transaction if b_transaction or * b_next_transaction points to it */ if (jh->b_transaction == transaction || jh->b_next_transaction == transaction) { unlock_buffer(bh); goto done; } /* * this is the first time this transaction is touching this buffer, * reset the modified flag */ jh->b_modified = 0; /* * If the buffer is not journaled right now, we need to make sure it * doesn't get written to disk before the caller actually commits the * new data */ if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); JBUFFER_TRACE(jh, "file as BJ_Reserved"); /* * Make sure all stores to jh (b_modified, b_frozen_data) are * visible before attaching it to the running transaction. * Paired with barrier in jbd2_write_access_granted() */ smp_wmb(); spin_lock(&journal->j_list_lock); if (test_clear_buffer_dirty(bh)) { /* * Execute buffer dirty clearing and jh->b_transaction * assignment under journal->j_list_lock locked to * prevent bh being removed from checkpoint list if * the buffer is in an intermediate state (not dirty * and jh->b_transaction is NULL). */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); set_buffer_jbddirty(bh); } __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); unlock_buffer(bh); goto done; } unlock_buffer(bh); /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); goto attach_next; } JBUFFER_TRACE(jh, "owned by older transaction"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); /* * There is one case we have to be very careful about. If the * committing transaction is currently writing this buffer out to disk * and has NOT made a copy-out, then we cannot modify the buffer * contents at all right now. The essence of copy-out is that it is * the extra copy, not the primary copy, which gets journaled. If the * primary copy is already going to disk then we cannot do copy-out * here. 
*/ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } /* * Only do the copy if the currently-owning transaction still needs it. * If buffer isn't on BJ_Metadata list, the committing transaction is * past that stage (here we use the fact that BH_Shadow is set under * bh_state lock together with refiling to BJ_Shadow list and at this * point we know the buffer doesn't have BH_Shadow set). * * Subtle point, though: if this is a get_undo_access, then we will be * relying on the frozen_data to contain the new value of the * committed_data record after the transaction, so we HAVE to force the * frozen_data copy in that case. */ if (jh->b_jlist == BJ_Metadata || force_copy) { JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; } jh->b_frozen_data = frozen_buffer; frozen_buffer = NULL; jbd2_freeze_jh_data(jh); } attach_next: /* * Make sure all stores to jh (b_modified, b_frozen_data) are visible * before attaching it to the running transaction. Paired with barrier * in jbd2_write_access_granted() */ smp_wmb(); jh->b_next_transaction = transaction; done: spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is * no longer valid */ jbd2_journal_cancel_revoke(handle, jh); out: if (unlikely(frozen_buffer)) /* It's usually NULL */ jbd2_free(frozen_buffer, bh->b_size); JBUFFER_TRACE(jh, "exit"); return error; } /* Fast check whether buffer is already attached to the required transaction */ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, bool undo) { struct journal_head *jh; bool ret = false; /* Dirty buffers require special handling... */ if (buffer_dirty(bh)) return false; /* * RCU protects us from dereferencing freed pages. So the checks we do * are guaranteed not to oops. However the jh slab object can get freed * & reallocated while we work with it. So we have to be careful. When * we see jh attached to the running transaction, we know it must stay * so until the transaction is committed. Thus jh won't be freed and * will be attached to the same bh while we run. However it can * happen jh gets freed, reallocated, and attached to the transaction * just after we get pointer to it from bh. So we have to be careful * and recheck jh still belongs to our bh before we return success. */ rcu_read_lock(); if (!buffer_jbd(bh)) goto out; /* This should be bh2jh() but that doesn't work with inline functions */ jh = READ_ONCE(bh->b_private); if (!jh) goto out; /* For undo access buffer must have data copied */ if (undo && !jh->b_committed_data) goto out; if (READ_ONCE(jh->b_transaction) != handle->h_transaction && READ_ONCE(jh->b_next_transaction) != handle->h_transaction) goto out; /* * There are two reasons for the barrier here: * 1) Make sure to fetch b_bh after we did previous checks so that we * detect when jh went through free, realloc, attach to transaction * while we were checking. Paired with implicit barrier in that path. * 2) So that access to bh done after jbd2_write_access_granted() * doesn't get reordered and see inconsistent state of concurrent * do_get_write_access(). 
*/ smp_mb(); if (unlikely(jh->b_bh != bh)) goto out; ret = true; out: rcu_read_unlock(); return ret; } /** * jbd2_journal_get_write_access() - notify intent to modify a buffer * for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * * Returns: error code or 0 on success. * * In full data journalling mode the buffer may be of type BJ_AsyncData, * because we're ``write()ing`` a buffer which is also part of a shared mapping. */ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { struct journal_head *jh; int rc; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_write_access_granted(handle, bh, false)) return 0; jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. */ rc = do_get_write_access(handle, jh, 0); jbd2_journal_put_journal_head(jh); return rc; } /* * When the user wants to journal a newly created buffer_head * (ie. getblk() returned a new buffer and we are going to populate it * manually rather than reading off disk), then we need to keep the * buffer_head locked until it has been completely filled with new * data. In this case, we should be able to make the assertion that * the bh is not already part of an existing transaction. * * The buffer should already be locked by the caller by this point. * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ /** * jbd2_journal_get_create_access () - notify intent to use newly created bh * @handle: transaction to new buffer to * @bh: new buffer. * * Call this if you create a new bh. */ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; jbd2_debug(5, "journal_head %p\n", jh); err = -EROFS; if (is_handle_aborted(handle)) goto out; journal = transaction->t_journal; err = 0; JBUFFER_TRACE(jh, "entry"); /* * The buffer may already belong to this transaction due to pre-zeroing * in the filesystem's new_block code. It may also be on the previous, * committing transaction's lists, but it HAS to be in Forget state in * that case: the transaction must have deleted the buffer for it to be * reused here. */ spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && jh->b_jlist == BJ_Forget))); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { /* * Previous jbd2_journal_forget() could have left the buffer * with jbddirty bit set because it was being committed. When * the commit finished, we've filed the buffer for * checkpointing and marked it dirty. Now we are reallocating * the buffer so the transaction freeing it must have * committed and so it's safe to clear the dirty bit. 
*/ clear_buffer_dirty(jh2bh(jh)); /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect * blocks which contain freed but then revoked metadata. We need * to cancel the revoke in case we end up freeing it yet again * and the reallocating as data - this would cause a second revoke, * which hits an assertion error. */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); out: jbd2_journal_put_journal_head(jh); return err; } /** * jbd2_journal_get_undo_access() - Notify intent to modify metadata with * non-rewindable consequences * @handle: transaction * @bh: buffer to undo * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses * this for freeing and allocating space, we have to make sure that we * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. * * To deal with that, jbd2_journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. * * We never need to know which transaction the committed data is part * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point * we can discard the old committed data pointer. * * Returns error number or 0 on success. */ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; struct journal_head *jh; char *committed_data = NULL; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done * atomically wrt. completion of any outstanding commits. */ err = do_get_write_access(handle, jh, 1); if (err) goto out; repeat: if (!jh->b_committed_data) committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { spin_unlock(&jh->b_state_lock); goto repeat; } jh->b_committed_data = committed_data; committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) jbd2_free(committed_data, bh->b_size); return err; } /** * jbd2_journal_set_triggers() - Add triggers for commit writeout * @bh: buffer to trigger on * @type: struct jbd2_buffer_trigger_type containing the trigger(s). 
* * Set any triggers on this journal_head. This is always safe, because * triggers for a committing buffer will be saved off, and triggers for * a running transaction will match the buffer in that transaction. * * Call with NULL to clear the triggers. */ void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { struct journal_head *jh = jbd2_journal_grab_journal_head(bh); if (WARN_ON_ONCE(!jh)) return; jh->b_triggers = type; jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); if (!triggers || !triggers->t_frozen) return; triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers) { if (!triggers || !triggers->t_abort) return; triggers->t_abort(triggers, jh2bh(jh)); } /** * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * * mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer must have previously had jbd2_journal_get_write_access() * called so that it has a valid journal_head attached to the buffer * head. * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * * Returns error number or 0 on success. * * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen * data present for that commit). In that case, we don't relink the * buffer: that only gets done when the old transaction finally * completes its commit. */ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh; int ret = 0; if (!buffer_jbd(bh)) return -EUCLEAN; /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); jbd2_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is * crucial to catch bugs so let's do a reliable check until the * lockless handling is fully proven. */ if (data_race(jh->b_transaction != transaction && jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. 
*/ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " "h_line_no=%u block_no=%llu jlist=%u\n", handle->h_type, handle->h_line_no, (unsigned long long) bh->b_blocknr, jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { /* * Check journal aborting with @jh->b_state_lock locked, * since 'jh->b_transaction' could be replaced with * 'jh->b_next_transaction' during old transaction * committing if journal aborted, which may fail * assertion on 'jh->b_frozen_data == NULL'. */ ret = -EROFS; goto out_unlock_bh; } if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part * of the transaction. This needs to be done * once a transaction -bzzz */ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { ret = -ENOSPC; goto out_unlock_bh; } jh->b_modified = 1; handle->h_total_credits--; } /* * fastpath, to avoid expensive locking. If this buffer is already * on the running transaction's metadata list there is nothing to do. * Nobody can take it off again because there is a handle open. * I _think_ we're OK here with SMP barriers - a mistaken decision will * result in this test being false, so we go in and take the locks. */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, jh->b_transaction ? jh->b_transaction->t_tid : 0, journal->j_running_transaction, journal->j_running_transaction ? journal->j_running_transaction->t_tid : 0); ret = -EINVAL; } goto out_unlock_bh; } set_buffer_jbddirty(bh); /* * Metadata already on the current transaction list doesn't * need to be filed. Metadata on another transaction's list must * be committing, and will be refiled once the commit completes: * leave it alone for now. */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); if (unlikely(((jh->b_transaction != journal->j_committing_transaction)) || (jh->b_next_transaction != transaction))) { printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " "bad jh for block %llu: " "transaction (%p, %u), " "jh->b_transaction (%p, %u), " "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, transaction, transaction->t_tid, jh->b_transaction, jh->b_transaction ? jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, jh->b_jlist); WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. */ goto out_unlock_bh; } /* That test should have eliminated the following case: */ J_ASSERT_JH(jh, jh->b_frozen_data == NULL); JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; } /** * jbd2_journal_forget() - bforget() for potentially-journaled buffers. 
* @handle: transaction handle * @bh: bh to 'forget' * * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we * can safely unlink it. * * bh may not be a journalled buffer at all - it may be a non-JBD * buffer which came off the hashtable. Check for this. * * Decrements bh->b_count by one. * * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; struct journal_head *jh; int drop_reserve = 0; int err = 0; int was_modified = 0; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; BUFFER_TRACE(bh, "entry"); jh = jbd2_journal_grab_journal_head(bh); if (!jh) { __bforget(bh); return 0; } spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; goto drop; } /* keep track of whether or not this transaction modified us */ was_modified = jh->b_modified; /* * The buffer's going from the transaction, we must drop * all references -bzzz */ jh->b_modified = 0; if (jh->b_transaction == transaction) { J_ASSERT_JH(jh, !jh->b_frozen_data); /* If we are forgetting a buffer which is already part * of this transaction, then we can just drop it from * the transaction immediately. */ clear_buffer_dirty(bh); clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); /* * we only want to drop a reference if this transaction * modified the buffer */ if (was_modified) drop_reserve = 1; /* * We are no longer going to journal this buffer. * However, the commit of this transaction is still * important to the buffer: the delete that we are now * processing might obsolete an old log entry, so by * committing, we can satisfy the buffer's checkpoint. * * So, if we have a checkpoint on the buffer, we should * now refile the buffer on our BJ_Forget list so that * we know to remove the checkpoint after we commit. */ spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); jbd2_journal_put_journal_head(jh); } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); /* ... but we CAN drop it from the new transaction through * marking the buffer as freed and set j_next_transaction to * the new transaction, so that not only the commit code * knows it should clear dirty bits when it is done with the * buffer, but also the buffer can be checkpointed only * after the new transaction commits. */ set_buffer_freed(bh); if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } else { J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified * the buffer */ if (was_modified) drop_reserve = 1; } } else { /* * Finally, if the buffer is not belongs to any * transaction, we can just drop it now if it has no * checkpoint. 
*/ spin_lock(&journal->j_list_lock); if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); goto drop; } /* * Otherwise, if the buffer has been written to disk, * it is safe to remove the checkpoint and drop it. */ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) { spin_unlock(&journal->j_list_lock); goto drop; } /* * The buffer is still not written to disk, we should * attach this buffer to current transaction so that the * buffer can be checkpointed only after the current * transaction commits. */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } drop: __brelse(bh); spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ handle->h_total_credits++; } return err; } /** * jbd2_journal_stop() - complete a transaction * @handle: transaction to complete. * * All done for a particular handle. * * There is not much action needed here. We just return any remaining * buffer credits to the transaction and remove the handle. The only * complication is that we need to start a commit operation if the * filesystem is marked for synchronous update. * * jbd2_journal_stop itself will not usually return an error, but it may * do so in unusual circumstances. In particular, expect it to * return -EIO if a jbd2_journal_abort has been executed since the * transaction began. */ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal; int err = 0, wait_for_commit = 0; tid_t tid; pid_t pid; if (--handle->h_ref > 0) { jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, handle->h_ref); if (is_handle_aborted(handle)) return -EIO; return 0; } if (!transaction) { /* * Handle is already detached from the transaction so there is * nothing to do other than free the handle. */ memalloc_nofs_restore(handle->saved_alloc_context); goto free_and_exit; } journal = transaction->t_journal; tid = transaction->t_tid; if (is_handle_aborted(handle)) err = -EIO; jbd2_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, (handle->h_requested_credits - handle->h_total_credits)); /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's * yield and let another thread piggyback onto this * transaction. Keep doing that while new threads continue to * arrive. It doesn't cost much - we're about to run a commit * and sleep on IO anyway. Speeds up many-threaded, many-dir * operations by 30x or more... * * We try and optimize the sleep time against what the * underlying disk can do, instead of having a static sleep * time. This is useful for the case where our storage is so * fast that it is more optimal to go ahead and force a flush * and wait for the transaction to be committed than it is to * wait for an arbitrary amount of time for new writers to * join the transaction. We achieve this by measuring how * long it takes to commit a transaction, and compare it with * how long this transaction has been running, and if run time * < commit time then we sleep for the delta and commit. This * greatly helps super fast disks that would see slowdowns as * more threads started doing fsyncs. 
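 *
 * For example, with an average commit time of ~5 ms, a synchronous handle
 * that has only been running for 2 ms waits briefly (bounded by
 * j_min_batch_time and j_max_batch_time) before forcing the commit, so
 * that other synchronous writers arriving in that window can share it.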
* * But don't do this if this process was the most recent one * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. * * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid && journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); commit_time = max_t(u64, commit_time, 1000*journal->j_min_batch_time); commit_time = min_t(u64, commit_time, 1000*journal->j_max_batch_time); if (trans_time < commit_time) { ktime_t expires = ktime_add_ns(ktime_get(), commit_time); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); } } if (handle->h_sync) transaction->t_synchronous_commit = 1; /* * If the handle is marked SYNC, we need to set another commit * going! We also want to force a commit if the transaction is too * old now. */ if (handle->h_sync || time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ jbd2_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ jbd2_log_start_commit(journal, tid); /* * Special case: JBD2_SYNC synchronous updates require us * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) wait_for_commit = 1; } /* * Once stop_this_handle() drops t_updates, the transaction could start * committing on us and eventually disappear. So we must not * dereference transaction pointer again after calling * stop_this_handle(). */ stop_this_handle(handle); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); free_and_exit: if (handle->h_rsv_handle) jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return err; } /* * * List management code snippets: various functions for manipulating the * transaction buffer lists. * */ /* * Append a buffer to a transaction list, given the transaction's list head * pointer. * * j_list_lock is held. * * jh->b_state_lock is held. */ static inline void __blist_add_buffer(struct journal_head **list, struct journal_head *jh) { if (!*list) { jh->b_tnext = jh->b_tprev = jh; *list = jh; } else { /* Insert at the tail of the list to preserve order */ struct journal_head *first = *list, *last = first->b_tprev; jh->b_tprev = last; jh->b_tnext = first; last->b_tnext = first->b_tprev = jh; } } /* * Remove a buffer from a transaction list, given the transaction's list * head pointer. * * Called with j_list_lock held, and the journal may not be locked. * * jh->b_state_lock is held. */ static inline void __blist_del_buffer(struct journal_head **list, struct journal_head *jh) { if (*list == jh) { *list = jh->b_tnext; if (*list == jh) *list = NULL; } jh->b_tprev->b_tnext = jh->b_tnext; jh->b_tnext->b_tprev = jh->b_tprev; } /* * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or * t_reserved_list. If the caller is holding onto a copy of one of these * pointers, it could go bad. Generally the caller needs to re-read the * pointer from the transaction_t. 
* * Called under j_list_lock. */ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); if (jh->b_jlist != BJ_None) J_ASSERT_JH(jh, transaction != NULL); switch (jh->b_jlist) { case BJ_None: return; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); list = &transaction->t_buffers; break; case BJ_Forget: list = &transaction->t_forget; break; case BJ_Shadow: list = &transaction->t_shadow_list; break; case BJ_Reserved: list = &transaction->t_reserved_list; break; } __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; if (transaction && is_journal_aborted(transaction->t_journal)) clear_buffer_jbddirty(bh); else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } /* * Remove buffer from all transactions. The caller is responsible for dropping * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { J_ASSERT_JH(jh, jh->b_transaction != NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); __brelse(bh); } /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @folio: Folio to detach data from. * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN * so try_to_free_buffers() can reap them. * * This function returns non-zero if we wish try_to_free_buffers() * to be called. We do this if the page is releasable by try_to_free_buffers(). * We also do it if the page has locked or dirty buffers and the caller wants * us to perform sync or async writeout. * * This complicates JBD locking somewhat. We aren't protected by the * BKL here. We wish to remove the buffer from its committing or * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. * * This may *change* the value of transaction_t->t_datalist, so anyone * who looks at t_datalist needs to lock against this function. * * Even worse, someone may be doing a jbd2_journal_dirty_data on this * buffer. So we need to lock against that. jbd2_journal_dirty_data() * will come out of the lock with the buffer dirty, which makes it * ineligible for release here. * * Who else is affected by this? hmm... Really the only contender * is do_get_write_access() - it could be looking at the buffer while * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? 
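 *
 * An illustrative sketch of a filesystem ->release_folio() forwarding here
 * ("example_get_journal()" is an assumed caller-side helper, not jbd2 API):
 *
 *	static bool example_release_folio(struct folio *folio, gfp_t gfp)
 *	{
 *		journal_t *journal = example_get_journal(folio->mapping->host);
 *
 *		if (!journal)
 *			return try_to_free_buffers(folio);
 *		return jbd2_journal_try_to_free_buffers(journal, folio);
 *	}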
* * Return false on failure, true on success */ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) { struct buffer_head *head; struct buffer_head *bh; bool ret = false; J_ASSERT(folio_test_locked(folio)); head = folio_buffers(folio); bh = head; do { struct journal_head *jh; /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) continue; spin_lock(&jh->b_state_lock); if (!jh->b_transaction && !jh->b_next_transaction) { spin_lock(&journal->j_list_lock); /* Remove written-back checkpointed metadata buffer */ if (jh->b_cp_transaction != NULL) jbd2_journal_try_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); } spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(folio); busy: return ret; } /* * This buffer is no longer needed. If it is on an older transaction's * checkpoint list we need to record it on this transaction's forget list * to pin this buffer (and hence its checkpointing transaction) down until * this transaction commits. If the buffer isn't on a checkpoint list, we * release it. * Returns non-zero if JBD no longer has an interest in the buffer. * * Called under j_list_lock. * * Called under jh->b_state_lock. */ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { int may_free = 1; struct buffer_head *bh = jh2bh(jh); if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); __jbd2_journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in * __journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); jbd2_journal_put_journal_head(jh); } return may_free; } /* * jbd2_journal_invalidate_folio * * This code is tricky. It has a number of cases to deal with. * * There are two invariants which this code relies on: * * i_size must be updated on disk before we start calling invalidate_folio * on the data. * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this * invariant, we can be sure that it is safe to throw away any buffers * attached to the current transaction: once the transaction commits, * we know that the data will not be needed. * * Note however that we can *not* throw away data belonging to the * previous, committing transaction! * * Any disk blocks which *are* part of the previous, committing * transaction (and which therefore cannot be discarded immediately) are * not going to be reused in the new running transaction * * The bitmap committed_data images guarantee this: any block which is * allocated in one transaction and removed in the next will be marked * as in-use in the committed_data bitmap, so cannot be reused until * the next transaction to delete the block commits. This means that * leaving committing buffers dirty is quite safe: the disk blocks * cannot be reallocated to a different file and so buffer aliasing is * not possible. * * * The above applies mainly to ordered data mode. 
In writeback mode we * don't make guarantees about the order in which data hits disk --- in * particular we don't guarantee that new dirty data is flushed before * transaction commit --- so it is always safe just to discard data * immediately in that mode. --sct */ /* * The journal_unmap_buffer helper function returns zero if the buffer * concerned remains pinned as an anonymous buffer belonging to an older * transaction. * * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; BUFFER_TRACE(bh, "entry"); /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are * holding the page lock. --sct */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) * is committed. Otherwise if the transaction changing the * buffer would be cleaned from the journal before T is * committed, a crash will cause that the correct contents of * the buffer will be lost. On the other hand we have to * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. * * Also we have to clear buffer_mapped flag of a truncated buffer * because the buffer_head may be attached to the page straddling * i_size (can happen only when blocksize < pagesize) and thus the * buffer_head can be reused when the file is extended again. So we end * up keeping around invalidated buffers attached to transactions' * BJ_Forget list just to stop checkpointing code from cleaning up * the transaction this buffer was modified in. */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it * has no checkpoint link, then we can zap it: * it's a writeback-mode buffer so we don't care * if it hits disk safely. */ if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "not on any transaction: zap"); goto zap_buffer; } if (!buffer_dirty(bh)) { /* bdflush has written it. We can drop it now */ __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } /* OK, it must be in the journal but still not * written fully to disk: it's metadata or * journaled data... */ if (journal->j_running_transaction) { /* ... and once the current transaction has * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); may_free = __dispose_buffer(jh, journal->j_running_transaction); goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have * passed into commit. We must attach this buffer to * the committing transaction, if it exists. 
*/ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); may_free = __dispose_buffer(jh, journal->j_committing_transaction); goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ clear_buffer_jbddirty(bh); __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); /* * The buffer is committing, we simply cannot touch * it. If the page is straddling i_size we have to wait * for commit and try again. */ if (partial_page) { spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); /* Already zapped buffer? Nothing to do... */ if (!bh->b_bdev) return 0; return -EBUSY; } /* * OK, buffer won't be reachable after truncate. We just clear * b_modified to not confuse transaction credit accounting, and * set j_next_transaction to the running transaction (if there * is one) and mark buffer as freed so that commit code knows * it should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. * We are writing our own transaction's data, not any * previous one's, so it is safe to throw it away * (remember that we expect the filesystem to have set * i_size already for this truncate so recovery will not * expose the disk blocks we are discarding here.) */ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); JBUFFER_TRACE(jh, "on running transaction"); may_free = __dispose_buffer(jh, transaction); } zap_buffer: /* * This is tricky. Although the buffer is truncated, it may be reused * if blocksize < pagesize and it is attached to the page straddling * EOF. Since the buffer might have been added to BJ_Forget list of the * running transaction, journal_get_write_access() won't clear * b_modified and credit accounting gets confused. So clear b_modified * here. */ jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } /** * jbd2_journal_invalidate_folio() * @journal: journal to use for flush... * @folio: folio to flush * @offset: start of the range to invalidate * @length: length of the range to invalidate * * Reap page buffers containing data after in the specified range in page. * Can return -EBUSY if buffers are part of the committing transaction and * the page is straddling i_size. Caller then has to wait for current commit * and try again. 
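 *
 * An illustrative sketch of the retry a caller is expected to perform on
 * -EBUSY (how the committing transaction's tid is obtained is a caller
 * detail and assumed here):
 *
 *	ret = jbd2_journal_invalidate_folio(journal, folio, offset, length);
 *	if (ret == -EBUSY) {
 *		... unlock the folio and wait for the committing transaction,
 *		    e.g. with jbd2_log_wait_commit(journal, tid), then retry ...
 *	}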
*/ int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; unsigned int stop = offset + length; unsigned int curr_off = 0; int partial_page = (offset || length < folio_size(folio)); int may_free = 1; int ret = 0; if (!folio_test_locked(folio)) BUG(); head = folio_buffers(folio); if (!head) return 0; BUG_ON(stop > folio_size(folio) || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ bh = head; do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; if (next_off > stop) return 0; if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); ret = journal_unmap_buffer(journal, bh, partial_page); unlock_buffer(bh); if (ret < 0) return ret; may_free &= ret; } curr_off = next_off; bh = next; } while (bh != head); if (!partial_page) { if (may_free && try_to_free_buffers(folio)) J_ASSERT(!folio_buffers(folio)); } return 0; } /* * File a buffer on the given transaction list. */ void __jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { struct journal_head **list = NULL; int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { /* * For metadata buffers, we track dirty bit in buffer_jbddirty * instead of buffer_dirty. We should not see a dirty bit set * here because we clear it in do_get_write_access but e.g. * tune2fs can modify the sb and set the dirty bit at any time * so we try to gracefully handle that. */ if (buffer_dirty(bh)) warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; } if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); else jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { case BJ_None: J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; break; case BJ_Forget: list = &transaction->t_forget; break; case BJ_Shadow: list = &transaction->t_shadow_list; break; case BJ_Reserved: list = &transaction->t_reserved_list; break; } __blist_add_buffer(list, jh); jh->b_jlist = jlist; if (was_dirty) set_buffer_jbddirty(bh); } void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); spin_unlock(&jh->b_state_lock); } /* * Remove a buffer from its current buffer list in preparation for * dropping it from its current transaction entirely. If the buffer has * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * * Called under j_list_lock * Called under jh->b_state_lock * * When this function returns true, there's no next transaction to refile to * and the caller has to drop jh reference through * jbd2_journal_put_journal_head(). 
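 *
 * The resulting caller pattern (this is what jbd2_journal_refile_buffer()
 * below does) is:
 *
 *	drop = __jbd2_journal_refile_buffer(jh);
 *	... drop the locks ...
 *	if (drop)
 *		jbd2_journal_put_journal_head(jh);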
*/ bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); return true; } /* * It has been modified by a later transaction: add it to the new * transaction's metadata list. */ was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); /* * b_transaction must be set, otherwise the new b_transaction won't * be holding jh reference */ J_ASSERT_JH(jh, jh->b_transaction != NULL); /* * We set b_transaction here because b_next_transaction will inherit * our jh reference and thus __jbd2_journal_file_buffer() must not * take a new one. */ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction); WRITE_ONCE(jh->b_next_transaction, NULL); if (buffer_freed(bh)) jlist = BJ_Forget; else if (jh->b_modified) jlist = BJ_Metadata; else jlist = BJ_Reserved; __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) set_buffer_jbddirty(bh); return false; } /* * __jbd2_journal_refile_buffer() with necessary locking added. We take our * bh reference so that we can safely unlock bh. * * The jh and bh may be freed by this call. */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { bool drop; spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); drop = __jbd2_journal_refile_buffer(jh); spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); if (drop) jbd2_journal_put_journal_head(jh); } /* * File inode in the inode list of the handle's transaction */ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, unsigned long flags, loff_t start_byte, loff_t end_byte) { transaction_t *transaction = handle->h_transaction; journal_t *journal; if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); jinode->i_flags |= flags; if (jinode->i_dirty_end) { jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); } else { jinode->i_dirty_start = start_byte; jinode->i_dirty_end = end_byte; } /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; /* * We only ever set this variable to 1 so the test is safe. Since * t_need_data_flush is likely to be set, we do the test to save some * cacheline bouncing */ if (!transaction->t_need_data_flush) transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { J_ASSERT(jinode->i_next_transaction == NULL); J_ASSERT(jinode->i_transaction == journal->j_committing_transaction); jinode->i_next_transaction = transaction; goto done; } /* Not on any transaction list... 
*/ J_ASSERT(!jinode->i_next_transaction); jinode->i_transaction = transaction; list_add(&jinode->i_list, &transaction->t_inode_list); done: spin_unlock(&journal->j_list_lock); return 0; } int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { return jbd2_journal_file_inode(handle, jinode, JI_WRITE_DATA | JI_WAIT_DATA, start_byte, start_byte + length - 1); } int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, start_byte, start_byte + length - 1); } /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is * committing, we cannot discard the data by truncate until we have * written them. Otherwise if we crashed after the transaction with * write has committed but before the transaction with truncate has * committed, we could see stale data in block A. This function is a * helper to solve this problem. It starts writeout of the truncated * part in case it is in the committing transaction. * * Filesystem code must call this function when inode is journaled in * ordered mode before truncation happens and after the inode has been * placed on orphan list with the new inode size. The second condition * avoids the race that someone writes new data and we start * committing the transaction after this function has been called but * before a transaction for truncate is started (and furthermore it * allows us to optimize the case where the addition to orphan list * happens in the same transaction as write --- we don't have to write * any data in such case). */ int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *jinode, loff_t new_size) { transaction_t *inode_trans, *commit_trans; int ret = 0; /* This is a quick check to avoid locking if not necessary */ if (!jinode->i_transaction) goto out; /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); if (inode_trans == commit_trans) { ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); } out: return ret; } |
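The jbd2 helpers above are only meaningful inside a running handle, so as a rough illustration (not code from this file) here is a hedged sketch of how an ordered-mode filesystem might pair a buffered write with jbd2_journal_inode_ranged_write(); the EXAMPLE_JOURNAL() accessor, the single credit and the function name are hypothetical.

/*
 * Illustrative sketch only: expected call order for the ordered-mode helpers
 * above. EXAMPLE_JOURNAL() and the one-credit reservation are placeholders;
 * a real filesystem (e.g. ext4) wires this into its own write path.
 */
static int example_ordered_write(struct inode *inode, struct jbd2_inode *jinode,
				 loff_t pos, loff_t len)
{
	journal_t *journal = EXAMPLE_JOURNAL(inode);	/* hypothetical accessor */
	handle_t *handle;
	int err;

	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... dirty the page cache / metadata under this handle ... */

	/* Record that [pos, pos+len) must hit disk before the commit block. */
	err = jbd2_journal_inode_ranged_write(handle, jinode, pos, len);

	jbd2_journal_stop(handle);
	return err;
}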
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_DST_OPS_H #define _NET_DST_OPS_H #include <linux/types.h> #include <linux/percpu_counter.h> #include <linux/cache.h> struct dst_entry; struct kmem_cachep; struct net_device; struct sk_buff; struct sock; struct net; struct dst_ops { unsigned short family; unsigned int gc_thresh; void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void (*redirect)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); int (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb); struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); void (*confirm_neigh)(const struct dst_entry *dst, const void *daddr); struct kmem_cache *kmem_cachep; struct percpu_counter pcpuc_entries ____cacheline_aligned_in_smp; }; static inline int dst_entries_get_fast(struct dst_ops *dst) { return percpu_counter_read_positive(&dst->pcpuc_entries); } static inline int dst_entries_get_slow(struct dst_ops *dst) { return percpu_counter_sum_positive(&dst->pcpuc_entries); } #define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { percpu_counter_add_batch(&dst->pcpuc_entries, val, DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) { return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) { percpu_counter_destroy(&dst->pcpuc_entries); } #endif |
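The per-cpu entry counter embedded in struct dst_ops is managed entirely through the inline helpers above; the following is a minimal, hedged sketch of that lifecycle, with example_dst_ops and the init function invented for illustration rather than taken from any real protocol.

/*
 * Sketch of the pcpuc_entries lifecycle; "example_dst_ops" is a placeholder,
 * not a real protocol's ops table, and most callbacks are left unset.
 */
static struct dst_ops example_dst_ops = {
	.family		= AF_INET,
	.gc_thresh	= 1024,
};

static int __init example_dst_counter_init(void)
{
	int err = dst_entries_init(&example_dst_ops);

	if (err)
		return err;
	dst_entries_add(&example_dst_ops, 1);	/* one dst_entry allocated */
	pr_info("dst entries: %d\n", dst_entries_get_fast(&example_dst_ops));
	dst_entries_add(&example_dst_ops, -1);	/* and released again */
	dst_entries_destroy(&example_dst_ops);
	return 0;
}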
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void hlist_bl_add_before(struct hlist_bl_node *n, struct hlist_bl_node *next) { struct hlist_bl_node **pprev = next->pprev; n->pprev = pprev; n->next = next; next->pprev = &n->next; /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, struct hlist_bl_node *prev) { n->next = prev->next; n->pprev = &prev->next; prev->next = n; if (n->next) n->next->pprev = &n->next; } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be
`first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif |
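Because bit 0 of hlist_bl_head->first doubles as the lock, every modification has to happen between hlist_bl_lock() and hlist_bl_unlock(); a small hedged sketch of that pattern follows, with struct demo_entry invented purely for illustration.

/* Sketch of the locking rule above; "struct demo_entry" is illustrative only. */
struct demo_entry {
	struct hlist_bl_node	node;
	unsigned long		key;
};

static void demo_insert(struct hlist_bl_head *head, struct demo_entry *e)
{
	hlist_bl_lock(head);			/* takes bit 0 of head->first */
	hlist_bl_add_head(&e->node, head);	/* helpers preserve the lock bit */
	hlist_bl_unlock(head);
}

static struct demo_entry *demo_lookup(struct hlist_bl_head *head, unsigned long key)
{
	struct hlist_bl_node *pos;
	struct demo_entry *e;

	hlist_bl_lock(head);
	hlist_bl_for_each_entry(e, pos, head, node) {
		if (e->key == key) {
			hlist_bl_unlock(head);
			return e;
		}
	}
	hlist_bl_unlock(head);
	return NULL;
}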
// SPDX-License-Identifier: GPL-2.0-or-later /* * Directory notifications for Linux. * * Copyright (C) 2000,2001,2002 Stephen Rothwell * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * dnotify was largely rewritten to use the new fsnotify infrastructure */ #include <linux/fs.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/dnotify.h> #include <linux/init.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/fdtable.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL static struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, {} }; static void __init dnotify_sysctl_init(void) { register_sysctl_init("fs", dnotify_sysctls); } #else #define dnotify_sysctl_init() do { } while (0) #endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which * is being watched by dnotify. If multiple userspace applications are watching * the same directory with dnotify their information is chained in dn */ struct dnotify_mark { struct fsnotify_mark fsn_mark; struct dnotify_struct *dn; }; /* * When a process starts or stops watching an inode the set of events which * dnotify cares about for that inode may change. This function runs the * list of everything receiving dnotify events about this directory and calculates * the set of all those events. After it updates what dnotify is interested in * it calls the fsnotify function so it can update the set of all events relevant * to this inode.
*/ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); assert_spin_locked(&fsn_mark->lock); for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); if (fsn_mark->mask == new_mask) return; fsn_mark->mask = new_mask; fsnotify_recalc_mask(fsn_mark->connector); } /* * Mains fsnotify call where events are delivered to dnotify. * Find the dnotify mark on the relevant inode, run the list of dnotify structs * on that mark and determine which of them has expressed interest in receiving * events of this type. When found send the correct process and signal and * destroy the dnotify struct if it was not registered to receive multiple * events. */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; /* not a dir, dnotify doesn't care */ if (!dir && !(mask & FS_ISDIR)) return 0; dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } fown = &dn->dn_filp->f_owner; send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; else { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(inode_mark); } } spin_unlock(&inode_mark->lock); return 0; } static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); BUG_ON(dn_mark->dn); kmem_cache_free(dnotify_mark_cache, dn_mark); } static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_inode_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; /* * Called every time a file is closed. Looks first for a dnotify mark on the * inode. If one is found run all of the ->dn structures attached to that * mark for one relevant to this process closing the file and remove that * dnotify_struct. If that was the last dnotify_struct also remove the * fsnotify_mark. 
*/ void dnotify_flush(struct file *filp, fl_owner_t id) { struct fsnotify_mark *fsn_mark; struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) return; fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); fsnotify_group_lock(dnotify_group); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(fsn_mark); break; } prev = &dn->dn_next; } spin_unlock(&fsn_mark->lock); /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ if (dn_mark->dn == NULL) { fsnotify_detach_mark(fsn_mark); free = true; } fsnotify_group_unlock(dnotify_group); if (free) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } /* this conversion is done only at watch creation */ static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; if (arg & DN_MULTISHOT) new_mask |= FS_DN_MULTISHOT; if (arg & DN_DELETE) new_mask |= (FS_DELETE | FS_MOVED_FROM); if (arg & DN_MODIFY) new_mask |= FS_MODIFY; if (arg & DN_ACCESS) new_mask |= FS_ACCESS; if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); return new_mask; } /* * If multiple processes watch the same inode with dnotify there is only one * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct * onto that mark. This function either attaches the new dnotify_struct onto * that list, or it |= the mask onto an existing dnofiy_struct. */ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, fl_owner_t id, int fd, struct file *filp, __u32 mask) { struct dnotify_struct *odn; odn = dn_mark->dn; while (odn != NULL) { /* adding more events to existing dnofiy_struct? */ if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { odn->dn_fd = fd; odn->dn_mask |= mask; return -EEXIST; } odn = odn->dn_next; } dn->dn_mask = mask; dn->dn_fd = fd; dn->dn_filp = filp; dn->dn_owner = id; dn->dn_next = dn_mark->dn; dn_mark->dn = dn; return 0; } /* * When a process calls fcntl to attach a dnotify watch to a directory it ends * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. 
*/ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; struct file *f; int destroy = 0, error = 0; __u32 mask; /* we use these to tell if we need to kfree */ new_fsn_mark = NULL; dn = NULL; if (!dir_notify_enable) { error = -EINVAL; goto out_err; } /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); error = 0; goto out_err; } /* dnotify only works on directories */ inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) { error = -ENOTDIR; goto out_err; } /* * convert the userspace DN_* "arg" to the internal FS_* * defined in fsnotify */ mask = convert_arg(arg); error = security_path_notify(&filp->f_path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) goto out_err; /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { error = -ENOMEM; goto out_err; } /* new fsnotify mark, we expect most fcntl calls to add a new mark */ new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); if (!new_dn_mark) { error = -ENOMEM; goto out_err; } /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { fsnotify_group_unlock(dnotify_group); goto out_err; } spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; /* we used new_fsn_mark, so don't free it */ new_fsn_mark = NULL; } rcu_read_lock(); f = lookup_fd_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the * fd is the only time we clean up the marks we need to get our mark * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too * since multiple calls to destroy_mark is perfectly safe, if * we found a dn_mark already attached to the inode, just sod * off silently as the flush at close time dealt with it. */ if (dn_mark == new_dn_mark) destroy = 1; error = 0; goto out; } __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ if (!error) dn = NULL; /* -EEXIST means that we didn't add this new dn and used an old one. 
* that isn't an error (and the unused dn should be freed) */ else if (error == -EEXIST) error = 0; dnotify_recalc_inode_mask(fsn_mark); out: spin_unlock(&fsn_mark->lock); if (destroy) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(dnotify_group); if (destroy) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); return error; } static int __init dnotify_init(void) { dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, FSNOTIFY_GROUP_NOFS); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); return 0; } module_init(dnotify_init) |
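From user space the whole path above is driven by fcntl(F_NOTIFY); the following small program is a hedged but self-contained illustration of the user-facing side (directory path taken from argv, SIGIO delivery relied on as set up by __f_setown() in fcntl_dirnotify()).

/* Minimal dnotify consumer; build with: cc -o dwatch dwatch.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t hits;

static void on_sigio(int sig)
{
	(void)sig;
	hits++;			/* kernel side delivered this via send_sigio() */
}

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <directory>\n", argv[0]);
		return 1;
	}
	signal(SIGIO, on_sigio);
	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0 || fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MULTISHOT) < 0) {
		perror("dnotify setup");
		return 1;
	}
	for (;;) {
		pause();	/* woken by SIGIO */
		printf("events so far: %d\n", (int)hits);
	}
}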
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = &current->thread.fpu; int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif |
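The comment block in this header spells out the invalidation contract; as a rough, hedged sketch (example_take_over_fpu() is not a real kernel function) this is what "invalidate the CPU you control, with preemption disabled" looks like in terms of the helpers above.

/*
 * Sketch of the invalidation rule: code about to clobber this CPU's FPU
 * registers (with the owning task's in-memory state already saved) marks
 * the per-CPU owner as stale so a later fpregs_state_valid() check fails
 * and forces a restore from memory. Illustrative only.
 */
static void example_take_over_fpu(void)
{
	preempt_disable();
	__cpu_invalidate_fpregs_state();	/* fpu_fpregs_owner_ctx = NULL */
	/* ... load and use the FPU registers for another purpose ... */
	preempt_enable();
}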
// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> /* * Functions to manage BPF programs attached to netns */ struct bpf_netns_link { struct bpf_link link; enum bpf_attach_type type; enum netns_bpf_attach_type netns_type; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet * pre_exit callback to clear this pointer. Must be accessed * with netns_bpf_mutex held. */ struct net *net; struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; #endif default: break; } } static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; #endif default: break; } } /* Must be called with netns_bpf_mutex held.
*/ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) { struct bpf_prog_array *run_array; run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); } static int link_index(struct net *net, enum netns_bpf_attach_type type, struct bpf_netns_link *link) { struct bpf_netns_link *pos; int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { if (pos == link) return i; i++; } return -ENOENT; } static int link_count(struct net *net, enum netns_bpf_attach_type type) { struct list_head *pos; int i = 0; list_for_each(pos, &net->bpf.links[type]) i++; return i; } static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog_array *prog_array) { struct bpf_netns_link *pos; unsigned int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { prog_array->items[i].prog = pos->link.prog; i++; } } static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *old_array, *new_array; struct net *net; int cnt, idx; mutex_lock(&netns_bpf_mutex); /* We can race with cleanup_net, but if we see a non-NULL * struct net pointer, pre_exit has not run yet and wait for * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; /* Mark attach point as unused */ netns_bpf_attach_type_unneed(type); /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); cnt = link_count(net, type); if (!cnt) { netns_bpf_run_array_detach(net, type); goto out_unlock; } old_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!new_array) { WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); goto out_unlock; } fill_prog_array(net, type, new_array); rcu_assign_pointer(net->bpf.run_array[type], new_array); bpf_prog_array_free(old_array); out_unlock: net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } static int bpf_netns_link_detach(struct bpf_link *link) { bpf_netns_link_release(link); return 0; } static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); kfree(net_link); } static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; if (new_prog->type != link->prog->type) return -EINVAL; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (!net || !check_net(net)) { /* Link auto-detached or netns dying */ ret = -ENOLINK; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); idx = link_index(net, type, net_link); ret = bpf_prog_array_update_at(run_array, idx, new_prog); if (ret) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } static int bpf_netns_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); 
unsigned int inum = 0; struct net *net; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (net && check_net(net)) inum = net->ns.inum; mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; info->netns.attach_type = net_link->type; return 0; } static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_link_info info = {}; bpf_netns_link_fill_info(link, &info); seq_printf(seq, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, info.netns.attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, }; /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct net *net, enum netns_bpf_attach_type type) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); struct bpf_prog_array *run_array; u32 prog_cnt = 0, flags = 0; run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) prog_cnt = bpf_prog_array_length(run_array); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) return -EFAULT; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) return 0; return bpf_prog_array_copy_to_user(run_array, prog_ids, attr->query.prog_cnt); } int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { enum netns_bpf_attach_type type; struct net *net; int ret; if (attr->query.query_flags) return -EINVAL; type = to_netns_bpf_attach_type(attr->query.attach_type); if (type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_query(attr, uattr, net, type); mutex_unlock(&netns_bpf_mutex); put_net(net); return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; int ret; if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } if (ret) goto out_unlock; attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) { WRITE_ONCE(run_array->items[0].prog, prog); } else { run_array = bpf_prog_array_alloc(1, GFP_KERNEL); if (!run_array) { ret = -ENOMEM; goto out_unlock; } run_array->items[0].prog = prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); } net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } /* Must be called with netns_bpf_mutex held. 
*/ static int __netns_bpf_prog_detach(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; struct bpf_prog *prog; int ret; if (attr->target_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); bpf_prog_put(prog); return ret; } static int netns_bpf_max_progs(enum netns_bpf_attach_type type) { switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; case NETNS_BPF_SK_LOOKUP: return 64; default: return 0; } } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int cnt, err; mutex_lock(&netns_bpf_mutex); cnt = link_count(net, type); if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; case NETNS_BPF_SK_LOOKUP: err = 0; /* nothing to check */ break; default: err = -EINVAL; break; } if (err) goto out_unlock; run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } list_add_tail(&net_link->node, &net->bpf.links[type]); fill_prog_array(net, type, run_array); run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); /* Mark attach point as used */ netns_bpf_attach_type_need(type); out_unlock: mutex_unlock(&netns_bpf_mutex); return err; } int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type netns_type; struct bpf_link_primer link_primer; struct bpf_netns_link *net_link; enum bpf_attach_type type; struct net *net; int err; if (attr->link_create.flags) return -EINVAL; type = attr->link_create.attach_type; netns_type = to_netns_bpf_attach_type(type); if (netns_type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->link_create.target_fd); if (IS_ERR(net)) return PTR_ERR(net); net_link = kzalloc(sizeof(*net_link), GFP_USER); if (!net_link) { err = -ENOMEM; goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, &bpf_netns_link_ops, prog); net_link->net = net; net_link->type = type; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); if (err) { kfree(net_link); goto out_put_net; } err = netns_bpf_link_attach(net, &net_link->link, netns_type); if (err) { bpf_link_cleanup(&link_primer); goto out_put_net; } put_net(net); return bpf_link_settle(&link_primer); out_put_net: put_net(net); return err; } static int __net_init netns_bpf_pernet_init(struct net *net) { int type; for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) 
INIT_LIST_HEAD(&net->bpf.links[type]); return 0; } static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ netns_bpf_attach_type_unneed(type); } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; static int __init netns_bpf_init(void) { return register_pernet_subsys(&netns_bpf_pernet_ops); } subsys_initcall(netns_bpf_init); |
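On the user-space side a netns-scoped link is created through BPF_LINK_CREATE with a network-namespace fd as the target, which lands in netns_bpf_link_create() above; here is a hedged libbpf sketch that assumes a separately built sk_lookup object file named sk_lookup.bpf.o containing a single program.

/* Hedged sketch: attach an sk_lookup program to the current netns via a link. */
#include <bpf/libbpf.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct bpf_object *obj = bpf_object__open_file("sk_lookup.bpf.o", NULL);
	struct bpf_program *prog;
	struct bpf_link *link;
	int netns_fd;

	if (!obj || bpf_object__load(obj))
		return 1;
	prog = bpf_object__next_program(obj, NULL);
	netns_fd = open("/proc/self/ns/net", O_RDONLY);
	if (!prog || netns_fd < 0)
		return 1;

	/* target_fd is the netns fd; see get_net_ns_by_fd() in link_create above. */
	link = bpf_program__attach_netns(prog, netns_fd);
	if (!link) {
		perror("bpf_program__attach_netns");
		return 1;
	}
	pause();	/* keep the link attached for this process's lifetime */
	bpf_link__destroy(link);
	return 0;
}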
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; long error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = mnt_want_write(path->mnt); if (error) goto out; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease().
*/ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_truncate); long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; f = fdget(fd); if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ if (f.file->f_flags & O_LARGEFILE) small = 0; dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) goto out_putf; error = -EPERM; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); error = security_file_truncate(f.file); if (!error) error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; if (offset < 0 || len <= 0) return -EINVAL; /* Return error if mode is not supported */ if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; /* Punch hole must have keep 
size set */ if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* Collapse range should only be used exclusively. */ if ((mode & FALLOC_FL_COLLAPSE_RANGE) && (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; /* Insert range should only be used exclusively. */ if ((mode & FALLOC_FL_INSERT_RANGE) && (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; /* Unshare range should only be used with allocate mode. */ if ((mode & FALLOC_FL_UNSHARE_RANGE) && (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * We can only allow pure fallocate on append only files */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. 
Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-RCY. */ override_cred->non_rcu = 1; old_cred = override_creds(override_cred); /* override_cred() gets its own ref */ put_cred(override_cred); return old_cred; } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. 
*/ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) revert_creds(old_cred); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); int error; error = -EBADF; if (!f.file) goto out; error = -ENOTDIR; if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct fd f = fdget(fd); int err = -EBADF; if (f.file) { err = vfs_fchmod(f.file, mode); fdput(f); } return err; } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fchown(f.file, user, group); fdput(f); } return error; } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); if (unlikely(error)) { put_write_access(inode); goto cleanup_file; } f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. 
Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Paired with smp_mb() in collapse_file() to ensure nr_thps * is up to date and the update to i_writecount by * get_write_access() is visible. Ensures subsequent insertion * of THPs into the page cache will fail. */ smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for symmetry. */ fsnotify_open(f); return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, d_backing_inode(dentry), open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { file->f_path = *path; return do_dentry_open(file, d_backing_inode(path->dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; validate_creds(cred); /* We must always pass in a valid mount pointer. 
*/ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; validate_creds(cred); f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @inode: the inode * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; error = do_dentry_open(f, inode, NULL); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use * @path: path of the file to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). * @path may be on the stackable filesystem and backing inode on the * underlying filesystem. In this case, we want to be able to return * the @real_path of the backing inode. This is done by embedding the * returned file into a container structure that also stores the path of * the backing inode on the underlying filesystem, which can be * retrieved using backing_file_real_path(). */ struct file *backing_file_open(const struct path *path, int flags, const struct path *real_path, const struct cred *cred) { struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; path_get(real_path); *backing_file_real_path(f) = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(backing_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. 
*/ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that either shouldn't be set by userspace like * FMODE_NONOTIFY or that aren't relevant in determining struct * open_flags like O_CLOEXEC. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 
0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. 
*/ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static long do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. 
*/ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = close_fd_get_file(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; return retval; } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: reserved for future extensions * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { return __close_range(fd, max_fd, flags); } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open); |
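/*
 * Editorial example (not part of the original file): a minimal sketch of how
 * an in-kernel user might combine filp_open(), kernel_read() and filp_close()
 * exported above.  The function name, path argument and buffer size are
 * hypothetical and purely illustrative; the block is kept under #if 0 so it
 * is never built.
 */
#if 0
static int example_read_small_file(const char *path)
{
	struct file *fp;
	loff_t pos = 0;
	char buf[128];
	ssize_t n;

	/* filp_open() returns a struct file * or an ERR_PTR() on failure */
	fp = filp_open(path, O_RDONLY, 0);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	/* kernel_read() advances @pos; leave room for a terminating NUL */
	n = kernel_read(fp, buf, sizeof(buf) - 1, &pos);
	if (n >= 0)
		buf[n] = '\0';

	/* drop the reference taken by filp_open() */
	filp_close(fp, NULL);

	return n < 0 ? n : 0;
}
#endif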
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mac80211 - channel management
 * Copyright 2020 - 2022 Intel Corporation
 */

#include <linux/nl80211.h>
#include <linux/export.h>
#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"

static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
					  struct ieee80211_chanctx *ctx)
{
	struct ieee80211_link_data *link;
	int num = 0;

	lockdep_assert_held(&local->chanctx_mtx);

	list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list)
		num++;

	return num;
}

static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local,
					  struct ieee80211_chanctx *ctx)
{
	struct ieee80211_link_data *link;
	int num = 0;

	lockdep_assert_held(&local->chanctx_mtx);

	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list)
		num++;

	return num;
}

int ieee80211_chanctx_refcount(struct ieee80211_local *local,
			       struct ieee80211_chanctx *ctx)
{
	return ieee80211_chanctx_num_assigned(local, ctx) +
	       ieee80211_chanctx_num_reserved(local, ctx);
}

static int ieee80211_num_chanctx(struct ieee80211_local *local)
{
	struct ieee80211_chanctx *ctx;
	int num = 0;

	lockdep_assert_held(&local->chanctx_mtx);

	list_for_each_entry(ctx, &local->chanctx_list, list)
		num++;

	return num;
}

static
bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local) { lockdep_assert_held(&local->chanctx_mtx); return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local); } static struct ieee80211_chanctx * ieee80211_link_get_chanctx(struct ieee80211_link_data *link) { struct ieee80211_local *local __maybe_unused = link->sdata->local; struct ieee80211_chanctx_conf *conf; conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) return NULL; return container_of(conf, struct ieee80211_chanctx, conf); } static const struct cfg80211_chan_def * ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { struct ieee80211_link_data *link; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (!compat) compat = &link->reserved_chandef; compat = cfg80211_chandef_compatible(&link->reserved_chandef, compat); if (!compat) break; } return compat; } static const struct cfg80211_chan_def * ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { struct ieee80211_link_data *link; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { struct ieee80211_bss_conf *link_conf = link->conf; if (link->reserved_chanctx) continue; if (!compat) compat = &link_conf->chandef; compat = cfg80211_chandef_compatible( &link_conf->chandef, compat); if (!compat) break; } return compat; } static const struct cfg80211_chan_def * ieee80211_chanctx_combined_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { lockdep_assert_held(&local->chanctx_mtx); compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); if (!compat) return NULL; compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat); if (!compat) return NULL; return compat; } static bool ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *def) { lockdep_assert_held(&local->chanctx_mtx); if (ieee80211_chanctx_combined_chandef(local, ctx, def)) return true; if (!list_empty(&ctx->reserved_links) && ieee80211_chanctx_reserved_chandef(local, ctx, def)) return true; return false; } static struct ieee80211_chanctx * ieee80211_find_reservation_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; if (!ieee80211_chanctx_can_reserve_chandef(local, ctx, chandef)) continue; return ctx; } return NULL; } static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta, unsigned int link_id) { enum ieee80211_sta_rx_bandwidth width; struct link_sta_info *link_sta; link_sta = rcu_dereference(sta->link[link_id]); /* no effect if this STA has no presence on this link */ if (!link_sta) return NL80211_CHAN_WIDTH_20_NOHT; width = ieee80211_sta_cap_rx_bw(link_sta); switch (width) { case IEEE80211_STA_RX_BW_20: if (link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20; else return NL80211_CHAN_WIDTH_20_NOHT; 
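	/* the wider RX capabilities below map to the matching chandef width;
	 * the 160 MHz case also covers 80+80 (see the comment there)
	 */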
case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: /* * This applied for both 160 and 80+80. since we use * the returned value to consider degradation of * ctx->conf.min_def, we have to make sure to take * the bigger one (NL80211_CHAN_WIDTH_160). * Otherwise we might try degrading even when not * needed, as the max required sta_bw returned (80+80) * might be smaller than the configured bw (160). */ return NL80211_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_320: return NL80211_CHAN_WIDTH_320; default: WARN_ON(1); return NL80211_CHAN_WIDTH_20; } } static enum nl80211_chan_width ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata, unsigned int link_id) { enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct sta_info *sta; list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { if (sdata != sta->sdata && !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; max_bw = max(max_bw, ieee80211_get_sta_bw(sta, link_id)); } return max_bw; } static enum nl80211_chan_width ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_vif *vif = &sdata->vif; int link_id; rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (link != rsvd_for && rcu_access_pointer(link->conf->chanctx_conf) != &ctx->conf) continue; switch (vif->type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: width = ieee80211_get_max_required_bw(sdata, link_id); break; case NL80211_IFTYPE_STATION: /* * The ap's sta->bandwidth is not set yet at this * point, so take the width from the chandef, but * account also for TDLS peers */ width = max(link->conf->chandef.width, ieee80211_get_max_required_bw(sdata, link_id)); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: width = link->conf->chandef.width; break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: WARN_ON_ONCE(1); } max_bw = max(max_bw, width); } rcu_read_unlock(); return max_bw; } static enum nl80211_chan_width ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { struct ieee80211_sub_if_data *sdata; enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { enum nl80211_chan_width width; if (!ieee80211_sdata_running(sdata)) continue; width = ieee80211_get_chanctx_vif_max_required_bw(sdata, ctx, rsvd_for); max_bw = max(max_bw, width); } /* use the configured bandwidth in case of monitor interface */ sdata = rcu_dereference(local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &ctx->conf) max_bw = max(max_bw, ctx->conf.def.width); rcu_read_unlock(); return max_bw; } /* * recalc the min required chan width of the channel context, which is * the max of min required widths of all the interfaces bound to this * channel context. 
*/ static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; lockdep_assert_held(&local->chanctx_mtx); /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return 0; } max_bw = ieee80211_get_chanctx_max_required_bw(local, ctx, rsvd_for); /* downgrade chandef up to max_bw */ min_def = ctx->conf.def; while (min_def.width > max_bw) ieee80211_chandef_downgrade(&min_def); if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def)) return 0; ctx->conf.min_def = min_def; if (!ctx->driver_present) return 0; return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; } /* calling this function is assuming that station vif is updated to * lates changes by calling ieee80211_link_update_chandef */ static void ieee80211_chan_bw_change(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, bool narrowed) { struct sta_info *sta; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[ctx->conf.def.chan->band]; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { struct ieee80211_sub_if_data *sdata = sta->sdata; enum ieee80211_sta_rx_bandwidth new_sta_bw; unsigned int link_id; if (!ieee80211_sdata_running(sta->sdata)) continue; for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) { struct ieee80211_bss_conf *link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); struct link_sta_info *link_sta; if (!link_conf) continue; if (rcu_access_pointer(link_conf->chanctx_conf) != &ctx->conf) continue; link_sta = rcu_dereference(sta->link[link_id]); if (!link_sta) continue; new_sta_bw = ieee80211_sta_cur_vht_bw(link_sta); /* nothing change */ if (new_sta_bw == link_sta->pub->bandwidth) continue; /* vif changed to narrow BW and narrow BW for station wasn't * requested or vise versa */ if ((new_sta_bw < link_sta->pub->bandwidth) == !narrowed) continue; link_sta->pub->bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, link_id, IEEE80211_RC_BW_CHANGED); } } rcu_read_unlock(); } /* * recalc the min required chan width of the channel context, which is * the max of min required widths of all the interfaces bound to this * channel context. 
*/ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); if (!changed) return; /* check is BW narrowed */ ieee80211_chan_bw_change(local, ctx, true); drv_change_chanctx(local, ctx, changed); /* check is BW wider */ ieee80211_chan_bw_change(local, ctx, false); } static void _ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, const struct cfg80211_chan_def *chandef, struct ieee80211_link_data *rsvd_for) { u32 changed; /* expected to handle only 20/40/80/160/320 channel widths */ switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_320: break; default: WARN_ON(1); } /* Check maybe BW narrowed - we do this _before_ calling recalc_chanctx_min_def * due to maybe not returning from it, e.g in case new context was added * first time with all parameters up to date. */ ieee80211_chan_bw_change(local, old_ctx, true); if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); return; } WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); ctx->conf.def = *chandef; /* check if min chanctx also changed */ changed = IEEE80211_CHANCTX_CHANGE_WIDTH | _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } /* check is BW wider */ ieee80211_chan_bw_change(local, old_ctx, false); } static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, const struct cfg80211_chan_def *chandef) { _ieee80211_change_chanctx(local, ctx, old_ctx, chandef, NULL); } static struct ieee80211_chanctx * ieee80211_find_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; list_for_each_entry(ctx, &local->chanctx_list, list) { const struct cfg80211_chan_def *compat; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) continue; if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef); if (!compat) continue; compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); if (!compat) continue; ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; } return NULL; } bool ieee80211_is_radar_required(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_held(&local->mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (link && link->radar_required) { rcu_read_unlock(); return true; } } } rcu_read_unlock(); return false; } static bool ieee80211_chanctx_radar_required(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; bool required = false; lockdep_assert_held(&local->chanctx_mtx); 
lockdep_assert_held(&local->mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { unsigned int link_id; if (!ieee80211_sdata_running(sdata)) continue; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (rcu_access_pointer(link->conf->chanctx_conf) != conf) continue; if (!link->radar_required) continue; required = true; break; } if (required) break; } rcu_read_unlock(); return required; } static struct ieee80211_chanctx * ieee80211_alloc_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); ctx = kzalloc(sizeof(*ctx) + local->hw.chanctx_data_size, GFP_KERNEL); if (!ctx) return NULL; INIT_LIST_HEAD(&ctx->assigned_links); INIT_LIST_HEAD(&ctx->reserved_links); ctx->conf.def = *chandef; ctx->conf.rx_chains_static = 1; ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; ctx->conf.radar_enabled = false; _ieee80211_recalc_chanctx_min_def(local, ctx, NULL); return ctx; } static int ieee80211_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { u32 changed; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); if (!local->use_chanctx) local->hw.conf.radar_enabled = ctx->conf.radar_enabled; /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) ieee80211_hw_config(local, changed); if (!local->use_chanctx) { local->_oper_chandef = ctx->conf.def; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { err = drv_add_chanctx(local, ctx); if (err) { ieee80211_recalc_idle(local); return err; } } return 0; } static struct ieee80211_chanctx * ieee80211_new_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); ctx = ieee80211_alloc_chanctx(local, chandef, mode); if (!ctx) return ERR_PTR(-ENOMEM); err = ieee80211_add_chanctx(local, ctx); if (err) { kfree(ctx); return ERR_PTR(err); } list_add_rcu(&ctx->list, &local->chanctx_list); return ctx; } static void ieee80211_del_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { lockdep_assert_held(&local->chanctx_mtx); if (!local->use_chanctx) { struct cfg80211_chan_def *chandef = &local->_oper_chandef; /* S1G doesn't have 20MHz, so get the correct width for the * current channel. */ if (chandef->chan->band == NL80211_BAND_S1GHZ) chandef->width = ieee80211_s1g_channel_width(chandef->chan); else chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for * single channel context. To be sure, check it ... 
*/ WARN_ON(local->hw.conf.radar_enabled && !list_empty(&local->chanctx_list)); local->hw.conf.radar_enabled = false; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { drv_remove_chanctx(local, ctx); } ieee80211_recalc_idle(local); } static void ieee80211_free_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { lockdep_assert_held(&local->chanctx_mtx); WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0); list_del_rcu(&ctx->list); ieee80211_del_chanctx(local, ctx); kfree_rcu(ctx, rcu_head); } void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; const struct cfg80211_chan_def *compat = NULL; struct sta_info *sta; lockdep_assert_held(&local->chanctx_mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int link_id; if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_bss_conf *link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (!link_conf) continue; if (rcu_access_pointer(link_conf->chanctx_conf) != conf) continue; if (!compat) compat = &link_conf->chandef; compat = cfg80211_chandef_compatible(&link_conf->chandef, compat); if (WARN_ON_ONCE(!compat)) break; } } if (WARN_ON_ONCE(!compat)) { rcu_read_unlock(); return; } /* TDLS peers can sometimes affect the chandef width */ list_for_each_entry_rcu(sta, &local->sta_list, list) { if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->tdls_chandef.chan) continue; compat = cfg80211_chandef_compatible(&sta->tdls_chandef, compat); if (WARN_ON_ONCE(!compat)) break; } rcu_read_unlock(); if (!compat) return; ieee80211_change_chanctx(local, ctx, ctx, compat); } static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { bool radar_enabled; lockdep_assert_held(&local->chanctx_mtx); /* for ieee80211_is_radar_required */ lockdep_assert_held(&local->mtx); radar_enabled = ieee80211_chanctx_radar_required(local, chanctx); if (radar_enabled == chanctx->conf.radar_enabled) return; chanctx->conf.radar_enabled = radar_enabled; if (!local->use_chanctx) { local->hw.conf.radar_enabled = chanctx->conf.radar_enabled; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); } static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, struct ieee80211_chanctx *new_ctx) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *curr_ctx = NULL; int ret = 0; if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) return -ENOTSUPP; conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (conf) { curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx); conf = NULL; list_del(&link->assigned_chanctx_list); } if (new_ctx) { /* recalc considering the link we'll use it for now */ ieee80211_recalc_chanctx_min_def(local, new_ctx, link); ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); if (ret) goto out; conf = &new_ctx->conf; list_add(&link->assigned_chanctx_list, &new_ctx->assigned_links); } out: 
rcu_assign_pointer(link->conf->chanctx_conf, conf); sdata->vif.cfg.idle = !conf; if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) { ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); ieee80211_recalc_radar_chanctx(local, curr_ctx); ieee80211_recalc_chanctx_min_def(local, curr_ctx, NULL); } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { ieee80211_recalc_txpower(sdata, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); } if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_MONITOR) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE); ieee80211_check_fast_xmit_iface(sdata); return ret; } void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { struct ieee80211_sub_if_data *sdata; u8 rx_chains_static, rx_chains_dynamic; lockdep_assert_held(&local->chanctx_mtx); rx_chains_static = 1; rx_chains_dynamic = 1; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { u8 needed_static, needed_dynamic; unsigned int link_id; if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!sdata->u.mgd.associated) continue; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: break; default: continue; } for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) continue; switch (link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", link->smps_mode); fallthrough; case IEEE80211_SMPS_OFF: needed_static = link->needed_rx_chains; needed_dynamic = link->needed_rx_chains; break; case IEEE80211_SMPS_DYNAMIC: needed_static = 1; needed_dynamic = link->needed_rx_chains; break; case IEEE80211_SMPS_STATIC: needed_static = 1; needed_dynamic = 1; break; } rx_chains_static = max(rx_chains_static, needed_static); rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); } } /* Disable SMPS for the monitor interface */ sdata = rcu_dereference(local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf) rx_chains_dynamic = rx_chains_static = local->rx_chains; rcu_read_unlock(); if (!local->use_chanctx) { if (rx_chains_static > 1) local->smps_mode = IEEE80211_SMPS_OFF; else if (rx_chains_dynamic > 1) local->smps_mode = IEEE80211_SMPS_DYNAMIC; else local->smps_mode = IEEE80211_SMPS_STATIC; ieee80211_hw_config(local, 0); } if (rx_chains_static == chanctx->conf.rx_chains_static && rx_chains_dynamic == chanctx->conf.rx_chains_dynamic) return; chanctx->conf.rx_chains_static = rx_chains_static; chanctx->conf.rx_chains_dynamic = rx_chains_dynamic; drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RX_CHAINS); } static void __ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_sub_if_data *vlan; struct ieee80211_chanctx_conf *conf; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) return; lockdep_assert_held(&local->mtx); /* Check that conf exists, even when clearing this function * must be called with the AP's channel context still 
there * as it would otherwise cause VLANs to have an invalid * channel context pointer for a while, possibly pointing * to a channel context that has already been freed. */ conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); WARN_ON(!conf); if (clear) conf = NULL; rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; rcu_assign_pointer(vlan_conf->chanctx_conf, conf); } rcu_read_unlock(); } void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear) { struct ieee80211_local *local = link->sdata->local; mutex_lock(&local->chanctx_mtx); __ieee80211_link_copy_chanctx_to_vlans(link, clear); mutex_unlock(&local->chanctx_mtx); } int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx *ctx = link->reserved_chanctx; lockdep_assert_held(&sdata->local->chanctx_mtx); if (WARN_ON(!ctx)) return -EINVAL; list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { if (WARN_ON(!ctx->replace_ctx)) return -EINVAL; WARN_ON(ctx->replace_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED); WARN_ON(ctx->replace_ctx->replace_ctx != ctx); ctx->replace_ctx->replace_ctx = NULL; ctx->replace_ctx->replace_state = IEEE80211_CHANCTX_REPLACE_NONE; list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } else { ieee80211_free_chanctx(sdata->local, ctx); } } return 0; } int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode, bool radar_required) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx; lockdep_assert_held(&local->chanctx_mtx); curr_ctx = ieee80211_link_get_chanctx(link); if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx) return -ENOTSUPP; new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); if (!new_ctx) { if (ieee80211_can_create_new_chanctx(local)) { new_ctx = ieee80211_new_chanctx(local, chandef, mode); if (IS_ERR(new_ctx)) return PTR_ERR(new_ctx); } else { if (!curr_ctx || (curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || !list_empty(&curr_ctx->reserved_links)) { /* * Another link already requested this context * for a reservation. Find another one hoping * all links assigned to it will also switch * soon enough. * * TODO: This needs a little more work as some * cases (more than 2 chanctx capable devices) * may fail which could otherwise succeed * provided some channel context juggling was * performed. * * Consider ctx1..3, link1..6, each ctx has 2 * links. link1 and link2 from ctx1 request new * different chandefs starting 2 in-place * reserations with ctx4 and ctx5 replacing * ctx1 and ctx2 respectively. Next link5 and * link6 from ctx3 reserve ctx4. If link3 and * link4 remain on ctx2 as they are then this * fails unless `replace_ctx` from ctx5 is * replaced with ctx3. 
*/ list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) continue; if (!list_empty(&ctx->reserved_links)) continue; curr_ctx = ctx; break; } } /* * If that's true then all available contexts already * have reservations and cannot be used. */ if (!curr_ctx || (curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || !list_empty(&curr_ctx->reserved_links)) return -EBUSY; new_ctx = ieee80211_alloc_chanctx(local, chandef, mode); if (!new_ctx) return -ENOMEM; new_ctx->replace_ctx = curr_ctx; new_ctx->replace_state = IEEE80211_CHANCTX_REPLACES_OTHER; curr_ctx->replace_ctx = new_ctx; curr_ctx->replace_state = IEEE80211_CHANCTX_WILL_BE_REPLACED; list_add_rcu(&new_ctx->list, &local->chanctx_list); } } list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links); link->reserved_chanctx = new_ctx; link->reserved_chandef = *chandef; link->reserved_radar_required = radar_required; link->reserved_ready = false; return 0; } static void ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: ieee80211_queue_work(&sdata->local->hw, &link->csa_finalize_work); break; case NL80211_IFTYPE_STATION: wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->u.mgd.chswitch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: case NUM_NL80211_IFTYPES: WARN_ON(1); break; } } static void ieee80211_link_update_chandef(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_sub_if_data *vlan; link->conf->chandef = *chandef; if (sdata->vif.type != NL80211_IFTYPE_AP) return; rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; vlan_conf->chandef = *chandef; } rcu_read_unlock(); } static int ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_vif_chanctx_switch vif_chsw[1] = {}; struct ieee80211_chanctx *old_ctx, *new_ctx; const struct cfg80211_chan_def *chandef; u64 changed = 0; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (WARN_ON(!link->reserved_ready)) return -EBUSY; if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(!old_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, &link->reserved_chandef); if (WARN_ON(!chandef)) return -EINVAL; if (link_conf->chandef.width != link->reserved_chandef.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chandef(link, &link->reserved_chandef); _ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef, link); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = 
&new_ctx->conf; vif_chsw[0].link_conf = link->conf; list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = drv_switch_vif_chanctx(local, vif_chsw, 1, CHANCTX_SWMODE_REASSIGN_VIF); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) ieee80211_free_chanctx(local, new_ctx); goto out; } list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links); rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_link_copy_chanctx_to_vlans(link, false); ieee80211_check_fast_xmit_iface(sdata); if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); if (changed) ieee80211_link_info_change_notify(sdata, link, changed); out: ieee80211_link_chanctx_reservation_complete(link); return err; } static int ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *old_ctx, *new_ctx; const struct cfg80211_chan_def *chandef; int err; old_ctx = ieee80211_link_get_chanctx(link); new_ctx = link->reserved_chanctx; if (WARN_ON(!link->reserved_ready)) return -EINVAL; if (WARN_ON(old_ctx)) return -EINVAL; if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, &link->reserved_chandef); if (WARN_ON(!chandef)) return -EINVAL; ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef); list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = ieee80211_assign_link_chanctx(link, new_ctx); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) ieee80211_free_chanctx(local, new_ctx); goto out; } out: ieee80211_link_chanctx_reservation_complete(link); return err; } static bool ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx *old_ctx, *new_ctx; lockdep_assert_held(&sdata->local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (!old_ctx) return false; if (WARN_ON(!new_ctx)) return false; if (old_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) return false; if (new_ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) return false; return true; } static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local, struct ieee80211_chanctx *new_ctx) { const struct cfg80211_chan_def *chandef; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL); if (WARN_ON(!chandef)) return -EINVAL; local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled; local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); return 0; } static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, int n_vifs) { struct ieee80211_vif_chanctx_switch *vif_chsw; struct ieee80211_link_data *link; struct ieee80211_chanctx *ctx, *old_ctx; int i, err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); vif_chsw = kcalloc(n_vifs, sizeof(vif_chsw[0]), GFP_KERNEL); if (!vif_chsw) return -ENOMEM; i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != 
IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto out; } list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (!ieee80211_link_has_in_place_reservation(link)) continue; old_ctx = ieee80211_link_get_chanctx(link); vif_chsw[i].vif = &link->sdata->vif; vif_chsw[i].old_ctx = &old_ctx->conf; vif_chsw[i].new_ctx = &ctx->conf; vif_chsw[i].link_conf = link->conf; i++; } } err = drv_switch_vif_chanctx(local, vif_chsw, n_vifs, CHANCTX_SWMODE_SWAP_CONTEXTS); out: kfree(vif_chsw); return err; } static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; ieee80211_del_chanctx(local, ctx->replace_ctx); err = ieee80211_add_chanctx(local, ctx); if (err) goto err; } return 0; err: WARN_ON(ieee80211_add_chanctx(local, ctx)); list_for_each_entry_continue_reverse(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; ieee80211_del_chanctx(local, ctx); WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx)); } return err; } static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; struct ieee80211_chanctx *new_ctx = NULL; int err, n_assigned, n_reserved, n_ready; int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); /* * If there are 2 independent pairs of channel contexts performing * cross-switch of their vifs this code will still wait until both are * ready even though it could be possible to switch one before the * other is ready. * * For practical reasons and code simplicity just do a single huge * switch. */ /* * Verify if the reservation is still feasible. 
* - if it's not then disconnect * - if it is but not all vifs necessary are ready then defer */ list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto err; } if (!local->use_chanctx) new_ctx = ctx; n_ctx++; n_assigned = 0; n_reserved = 0; n_ready = 0; list_for_each_entry(link, &ctx->replace_ctx->assigned_links, assigned_chanctx_list) { n_assigned++; if (link->reserved_chanctx) { n_reserved++; if (link->reserved_ready) n_ready++; } } if (n_assigned != n_reserved) { if (n_ready == n_reserved) { wiphy_info(local->hw.wiphy, "channel context reservation cannot be finalized because some interfaces aren't switching\n"); err = -EBUSY; goto err; } return -EAGAIN; } ctx->conf.radar_enabled = false; list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (ieee80211_link_has_in_place_reservation(link) && !link->reserved_ready) return -EAGAIN; old_ctx = ieee80211_link_get_chanctx(link); if (old_ctx) { if (old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) n_vifs_switch++; else n_vifs_assign++; } else { n_vifs_ctxless++; } if (link->reserved_radar_required) ctx->conf.radar_enabled = true; } } if (WARN_ON(n_ctx == 0) || WARN_ON(n_vifs_switch == 0 && n_vifs_assign == 0 && n_vifs_ctxless == 0) || WARN_ON(n_ctx > 1 && !local->use_chanctx) || WARN_ON(!new_ctx && !local->use_chanctx)) { err = -EINVAL; goto err; } /* * All necessary vifs are ready. Perform the switch now depending on * reservations and driver capabilities. */ if (local->use_chanctx) { if (n_vifs_switch > 0) { err = ieee80211_chsw_switch_vifs(local, n_vifs_switch); if (err) goto err; } if (n_vifs_assign > 0 || n_vifs_ctxless > 0) { err = ieee80211_chsw_switch_ctxs(local); if (err) goto err; } } else { err = ieee80211_chsw_switch_hwconf(local, new_ctx); if (err) goto err; } /* * Update all structures, values and pointers to point to new channel * context(s). 
*/ list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link, *link_tmp; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto err; } list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; u64 changed = 0; if (!ieee80211_link_has_in_place_reservation(link)) continue; rcu_assign_pointer(link_conf->chanctx_conf, &ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_link_copy_chanctx_to_vlans(link, false); ieee80211_check_fast_xmit_iface(sdata); link->radar_required = link->reserved_radar_required; if (link_conf->chandef.width != link->reserved_chandef.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chandef(link, &link->reserved_chandef); if (changed) ieee80211_link_info_change_notify(sdata, link, changed); ieee80211_recalc_txpower(sdata, false); } ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx, NULL); list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { if (ieee80211_link_get_chanctx(link) != ctx) continue; list_del(&link->reserved_chanctx_list); list_move(&link->assigned_chanctx_list, &ctx->assigned_links); link->reserved_chanctx = NULL; ieee80211_link_chanctx_reservation_complete(link); } /* * This context might have been a dependency for an already * ready re-assign reservation interface that was deferred. Do * not propagate error to the caller though. The in-place * reservation for originally requested interface has already * succeeded at this point. 
*/ list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) continue; if (WARN_ON(link->reserved_chanctx != ctx)) continue; if (!link->reserved_ready) continue; if (ieee80211_link_get_chanctx(link)) err = ieee80211_link_use_reserved_reassign(link); else err = ieee80211_link_use_reserved_assign(link); if (err) { link_info(link, "failed to finalize (re-)assign reservation (err=%d)\n", err); ieee80211_link_unreserve_chanctx(link); cfg80211_stop_iface(local->hw.wiphy, &link->sdata->wdev, GFP_KERNEL); } } } /* * Finally free old contexts */ list_for_each_entry_safe(ctx, ctx_tmp, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; ctx->replace_ctx->replace_ctx = NULL; ctx->replace_ctx->replace_state = IEEE80211_CHANCTX_REPLACE_NONE; list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } return 0; err: list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link, *link_tmp; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { ieee80211_link_unreserve_chanctx(link); ieee80211_link_chanctx_reservation_complete(link); } } return err; } static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; bool use_reserved_switch = false; lockdep_assert_held(&local->chanctx_mtx); conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) return; ctx = container_of(conf, struct ieee80211_chanctx, conf); if (link->reserved_chanctx) { if (link->reserved_chanctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && ieee80211_chanctx_num_reserved(local, link->reserved_chanctx) > 1) use_reserved_switch = true; ieee80211_link_unreserve_chanctx(link); } ieee80211_assign_link_chanctx(link, NULL); if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); link->radar_required = false; /* Unreserving may ready an in-place reservation. 
*/ if (use_reserved_switch) ieee80211_vif_use_reserved_switch(local); } int ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *ctx; u8 radar_detect_width = 0; int ret; lockdep_assert_held(&local->mtx); if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link->link_id))) { ieee80211_link_update_chandef(link, chandef); return 0; } mutex_lock(&local->chanctx_mtx); ret = cfg80211_chandef_dfs_required(local->hw.wiphy, chandef, sdata->wdev.iftype); if (ret < 0) goto out; if (ret > 0) radar_detect_width = BIT(chandef->width); link->radar_required = ret; ret = ieee80211_check_combinations(sdata, chandef, mode, radar_detect_width); if (ret < 0) goto out; __ieee80211_link_release_channel(link); ctx = ieee80211_find_chanctx(local, chandef, mode); if (!ctx) ctx = ieee80211_new_chanctx(local, chandef, mode); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto out; } ieee80211_link_update_chandef(link, chandef); ret = ieee80211_assign_link_chanctx(link, ctx); if (ret) { /* if assign fails refcount stays the same */ if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); goto out; } ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); out: if (ret) link->radar_required = false; mutex_unlock(&local->chanctx_mtx); return ret; } int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *new_ctx; struct ieee80211_chanctx *old_ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return -EINVAL; if (WARN_ON(link->reserved_ready)) return -EINVAL; link->reserved_ready = true; if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) { if (old_ctx) return ieee80211_link_use_reserved_reassign(link); return ieee80211_link_use_reserved_assign(link); } /* * In-place reservation may need to be finalized now either if: * a) sdata is taking part in the swapping itself and is the last one * b) sdata has switched with a re-assign reservation to an existing * context readying in-place switching of old_ctx * * In case of (b) do not propagate the error up because the requested * sdata already switched successfully. Just spill an extra warning. * The ieee80211_vif_use_reserved_switch() already stops all necessary * interfaces upon failure. 
*/ if ((old_ctx && old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { err = ieee80211_vif_use_reserved_switch(local); if (err && err != -EAGAIN) { if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return err; wiphy_info(local->hw.wiphy, "depending in-place reservation failed (err=%d)\n", err); } } return 0; } int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; const struct cfg80211_chan_def *compat; int ret; if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, IEEE80211_CHAN_DISABLED)) return -EINVAL; mutex_lock(&local->chanctx_mtx); if (cfg80211_chandef_identical(chandef, &link_conf->chandef)) { ret = 0; goto out; } if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT || link_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) { ret = -EINVAL; goto out; } conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) { ret = -EINVAL; goto out; } ctx = container_of(conf, struct ieee80211_chanctx, conf); compat = cfg80211_chandef_compatible(&conf->def, chandef); if (!compat) { ret = -EINVAL; goto out; } switch (ctx->replace_state) { case IEEE80211_CHANCTX_REPLACE_NONE: if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) { ret = -EBUSY; goto out; } break; case IEEE80211_CHANCTX_WILL_BE_REPLACED: /* TODO: Perhaps the bandwidth change could be treated as a * reservation itself? */ ret = -EBUSY; goto out; case IEEE80211_CHANCTX_REPLACES_OTHER: /* channel context that is going to replace another channel * context doesn't really exist and shouldn't be assigned * anywhere yet */ WARN_ON(1); break; } ieee80211_link_update_chandef(link, chandef); ieee80211_recalc_chanctx_chantype(local, ctx); *changed |= BSS_CHANGED_BANDWIDTH; ret = 0; out: mutex_unlock(&local->chanctx_mtx); return ret; } void ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; mutex_lock(&sdata->local->chanctx_mtx); if (rcu_access_pointer(link->conf->chanctx_conf)) { lockdep_assert_held(&sdata->local->mtx); __ieee80211_link_release_channel(link); } mutex_unlock(&sdata->local->chanctx_mtx); } void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_bss_conf *ap_conf; struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *ap; struct ieee80211_chanctx_conf *conf; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->bss)) return; ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); mutex_lock(&local->chanctx_mtx); rcu_read_lock(); ap_conf = rcu_dereference(ap->vif.link_conf[link_id]); conf = rcu_dereference_protected(ap_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); rcu_assign_pointer(link_conf->chanctx_conf, conf); rcu_read_unlock(); mutex_unlock(&local->chanctx_mtx); } void ieee80211_iter_chan_contexts_atomic( struct ieee80211_hw *hw, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct 
ieee80211_chanctx *ctx;

	rcu_read_lock();
	list_for_each_entry_rcu(ctx, &local->chanctx_list, list)
		if (ctx->driver_present)
			iter(hw, &ctx->conf, iter_data);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic);
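/*
 * Editor's illustrative sketch, not part of chanctx.c: a minimal example of
 * how a driver could consume the exported iterator above. Only
 * ieee80211_iter_chan_contexts_atomic() is taken from the code above; the
 * helper names my_count_chanctx()/my_num_chanctx() are hypothetical.
 */
static void my_count_chanctx(struct ieee80211_hw *hw,
			     struct ieee80211_chanctx_conf *conf,
			     void *data)
{
	int *n = data;

	/* called under RCU for each context the driver has been told about */
	(*n)++;
}

static int my_num_chanctx(struct ieee80211_hw *hw)
{
	int n = 0;

	ieee80211_iter_chan_contexts_atomic(hw, my_count_chanctx, &n);
	return n;
}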
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_IO_URING_H
#define _LINUX_IO_URING_H

#include <linux/sched.h>
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* the request is executed from poll, it should not be freed */
	IO_URING_F_MULTISHOT		= 4,
	/* executed by io-wq */
	IO_URING_F_IOWQ			= 8,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,

	/* ctx state flags, for URING_CMD */
	IO_URING_F_SQE128		= (1 << 8),
	IO_URING_F_CQE32		= (1 << 9),
	IO_URING_F_IOPOLL		= (1 << 10),
};

struct io_uring_cmd {
	struct file	*file;
	const struct io_uring_sqe *sqe;
	union {
		/* callback to defer completions to task context */
		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
		/* used for polled completion */
		void *cookie;
	};
	u32		cmd_op;
	u32		flags;
	u8		pdu[32]; /* available inline for free use */
};

static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
	return sqe->cmd;
}

#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
		       unsigned issue_flags);
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
			       void (*task_work_cb)(struct io_uring_cmd *, unsigned),
			       unsigned flags);
/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
				  void (*task_work_cb)(struct io_uring_cmd *, unsigned));

static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
						 void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
}

static inline void io_uring_files_cancel(void)
{
	if (current->io_uring) {
		io_uring_unreg_ringfd();
		__io_uring_cancel(false);
	}
}
static inline void io_uring_task_cancel(void)
{
	if (current->io_uring)
		__io_uring_cancel(true);
}
static inline void io_uring_free(struct task_struct *tsk)
{
	if (tsk->io_uring)
		__io_uring_free(tsk);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
					    struct iov_iter *iter, void *ioucmd)
{
	return -EOPNOTSUPP;
}
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
				     ssize_t ret2, unsigned issue_flags)
{
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
						 void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
						void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline struct sock *io_uring_get_socket(struct file *file)
{
	return NULL;
}
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
{
}
static inline const char *io_uring_get_opcode(u8 opcode)
{
	return "";
}
static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
				    unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
#endif

#endif
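/*
 * Editor's illustrative sketch, not part of the header above: a minimal
 * ->uring_cmd() handler wired up with the helpers declared in this file.
 * The handler names, the my_cmd payload layout and the MY_CMD_OP opcode are
 * hypothetical; only io_uring_sqe_cmd(), io_uring_cmd_complete_in_task() and
 * io_uring_cmd_done() come from the header.
 */
#define MY_CMD_OP	0x1234		/* hypothetical driver opcode */

struct my_cmd {
	__u64 user_arg;			/* payload carried in sqe->cmd */
};

static void my_cmd_task_work(struct io_uring_cmd *ioucmd, unsigned issue_flags)
{
	/* runs in the submitter's task context; post the CQE now */
	io_uring_cmd_done(ioucmd, 0, 0, issue_flags);
}

static int my_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	const struct my_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);

	if (ioucmd->cmd_op != MY_CMD_OP)
		return -EOPNOTSUPP;

	/* ... start the operation described by cmd->user_arg ... */
	(void)cmd;

	/* defer completion to task context via task_work */
	io_uring_cmd_complete_in_task(ioucmd, my_cmd_task_work);
	return -EIOCBQUEUED;
}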
/* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI sockets. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/utsname.h> #include <linux/sched.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); static DEFINE_IDA(sock_cookie_ida); static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ /* Socket info */ #define hci_pi(sk) ((struct hci_pinfo *) sk) struct hci_pinfo { struct bt_sock bt; struct hci_dev *hdev; struct hci_filter filter; __u8 cmsg_mask; unsigned short channel; unsigned long flags; __u32 cookie; char comm[TASK_COMM_LEN]; __u16 mtu; }; static struct hci_dev *hci_hdev_from_sock(struct sock *sk) { struct hci_dev *hdev = hci_pi(sk)->hdev; if (!hdev) return ERR_PTR(-EBADFD); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) return ERR_PTR(-EPIPE); return hdev; } void hci_sock_set_flag(struct sock *sk, int nr) { set_bit(nr, &hci_pi(sk)->flags); } void hci_sock_clear_flag(struct sock *sk, int nr) { clear_bit(nr, &hci_pi(sk)->flags); } int hci_sock_test_flag(struct sock *sk, int nr) { return test_bit(nr, &hci_pi(sk)->flags); } unsigned short hci_sock_get_channel(struct sock *sk) { return hci_pi(sk)->channel; } u32 hci_sock_get_cookie(struct sock *sk) { return hci_pi(sk)->cookie; } static bool hci_sock_gen_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (!id) { id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); if (id < 0) id = 0xffffffff; hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); return true; } return false; } static void hci_sock_free_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (id) { hci_pi(sk)->cookie = 0xffffffff; ida_simple_remove(&sock_cookie_ida, id); } } static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } /* Security filter */ #define HCI_SFLT_MAX_OGF 5 struct hci_sec_filter { __u32 type_mask; __u32 event_mask[2]; __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4]; }; static const struct hci_sec_filter hci_sec_filter = { /* Packet types */ 0x10, /* Events */ { 0x1000d9fe, 0x0000b00c }, /* Commands */ { { 0x0 }, /* OGF_LINK_CTL */ { 0xbe000006, 0x00000001, 0x00000000, 0x00 }, /* OGF_LINK_POLICY */ { 0x00005200, 0x00000000, 0x00000000, 0x00 }, /* OGF_HOST_CTL */ { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 }, /* OGF_INFO_PARAM */ { 0x000002be, 0x00000000, 0x00000000, 0x00 }, /* OGF_STATUS_PARAM */ { 0x000000ea, 0x00000000, 0x00000000, 0x00 } } }; static struct bt_sock_list hci_sk_list = { .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock) }; static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb) { struct hci_filter *flt; int flt_type, flt_event; /* Apply filter */ flt = &hci_pi(sk)->filter; flt_type = 
hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS; if (!test_bit(flt_type, &flt->type_mask)) return true; /* Extra filter for event packets only */ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT) return false; flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); if (!hci_test_bit(flt_event, &flt->event_mask)) return true; /* Check filter only when opcode is set */ if (!flt->opcode) return false; if (flt_event == HCI_EV_CMD_COMPLETE && flt->opcode != get_unaligned((__le16 *)(skb->data + 3))) return true; if (flt_event == HCI_EV_CMD_STATUS && flt->opcode != get_unaligned((__le16 *)(skb->data + 4))) return true; return false; } /* Send frame to RAW socket */ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) { struct sock *sk; struct sk_buff *skb_copy = NULL; BT_DBG("hdev %p len %d", hdev, skb->len); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev) continue; /* Don't send frame to the socket it came from */ if (skb->sk == sk) continue; if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) { if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { if (!bt_cb(skb)->incoming) continue; if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; } else { /* Don't send frame to other channel types */ continue; } if (!skb_copy) { /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true); if (!skb_copy) continue; /* Put type byte before the data */ memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&hci_sk_list.lock); kfree_skb(skb_copy); } static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb) { struct scm_creds *creds; if (!sk || WARN_ON(!skb)) return; creds = &bt_cb(skb)->creds; /* Check if peer credentials is set */ if (!sk->sk_peer_pid) { /* Check if parent peer credentials is set */ if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid) sk = bt_sk(sk)->parent; else return; } /* Check if scm_creds already set */ if (creds->pid == pid_vnr(sk->sk_peer_pid)) return; memset(creds, 0, sizeof(*creds)); creds->pid = pid_vnr(sk->sk_peer_pid); if (sk->sk_peer_cred) { creds->uid = sk->sk_peer_cred->uid; creds->gid = sk->sk_peer_cred->gid; } } static struct sk_buff *hci_skb_clone(struct sk_buff *skb) { struct sk_buff *nskb; if (!skb) return NULL; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return NULL; hci_sock_copy_creds(skb->sk, nskb); return nskb; } /* Send frame to sockets with specific channel */ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct sock *sk; BT_DBG("channel %u len %d", channel, skb->len); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; if (sk->sk_state != BT_BOUND) continue; if (hci_pi(sk)->channel != channel) continue; nskb = hci_skb_clone(skb); if (!nskb) continue; if (sock_queue_rcv_skb(sk, 
nskb)) kfree_skb(nskb); } } void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { read_lock(&hci_sk_list.lock); __hci_send_to_channel(channel, skb, flag, skip_sk); read_unlock(&hci_sk_list.lock); } /* Send frame to monitor socket */ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) { struct sk_buff *skb_copy = NULL; struct hci_mon_hdr *hdr; __le16 opcode; if (!atomic_read(&monitor_promisc)) return; BT_DBG("hdev %p len %d", hdev, skb->len); switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: opcode = cpu_to_le16(HCI_MON_COMMAND_PKT); break; case HCI_EVENT_PKT: opcode = cpu_to_le16(HCI_MON_EVENT_PKT); break; case HCI_ACLDATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT); break; case HCI_SCODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; case HCI_ISODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; default: return; } /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, GFP_ATOMIC, true); if (!skb_copy) return; hci_sock_copy_creds(skb->sk, skb_copy); /* Put header before the data */ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb_copy); } void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, void *data, u16 data_len, ktime_t tstamp, int flag, struct sock *skip_sk) { struct sock *sk; __le16 index; if (hdev) index = cpu_to_le16(hdev->id); else index = cpu_to_le16(MGMT_INDEX_NONE); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct hci_mon_hdr *hdr; struct sk_buff *skb; if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) continue; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); if (!skb) continue; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(event, skb_put(skb, 2)); if (data) skb_put_data(skb, data, data_len); skb->tstamp = tstamp; hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; switch (event) { case HCI_DEV_REG: skb = bt_skb_alloc(HCI_MON_NEW_INDEX_SIZE, GFP_ATOMIC); if (!skb) return NULL; ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); memcpy(ni->name, hdev->name, 8); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; case HCI_DEV_UNREG: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; case HCI_DEV_SETUP: if (hdev->manufacturer == 0xffff) return NULL; fallthrough; case HCI_DEV_UP: skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); if (!skb) 
return NULL; ii = skb_put(skb, HCI_MON_INDEX_INFO_SIZE); bacpy(&ii->bdaddr, &hdev->bdaddr); ii->manufacturer = cpu_to_le16(hdev->manufacturer); opcode = cpu_to_le16(HCI_MON_INDEX_INFO); break; case HCI_DEV_OPEN: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); break; case HCI_DEV_CLOSE: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); break; default: return NULL; } __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; u16 format; u8 ver[3]; u32 flags; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: format = 0x0000; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_USER: format = 0x0001; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(format, skb_put(skb, 2)); skb_put_data(skb, ver, sizeof(ver)); put_unaligned_le32(flags, skb_put(skb, 4)); skb_put_u8(skb, TASK_COMM_LEN); skb_put_data(skb, hci_pi(sk)->comm, TASK_COMM_LEN); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, u16 opcode, u16 len, const void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) 
{ size_t len; struct hci_mon_hdr *hdr; struct sk_buff *skb; va_list args; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args); va_end(args); skb = bt_skb_alloc(len + 1, GFP_ATOMIC); if (!skb) return; hci_sock_copy_creds(sk, skb); va_start(args, fmt); vsprintf(skb_put(skb, len), fmt, args); *(u8 *)skb_put(skb, 1) = 0; va_end(args); __net_timestamp(skb); hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE); hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } static void send_monitor_replay(struct sock *sk) { struct hci_dev *hdev; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { struct sk_buff *skb; skb = create_monitor_event(hdev, HCI_DEV_REG); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) continue; skb = create_monitor_event(hdev, HCI_DEV_OPEN); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (test_bit(HCI_UP, &hdev->flags)) skb = create_monitor_event(hdev, HCI_DEV_UP); else if (hci_dev_test_flag(hdev, HCI_SETUP)) skb = create_monitor_event(hdev, HCI_DEV_SETUP); else skb = NULL; if (skb) { if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } } read_unlock(&hci_dev_list_lock); } static void send_monitor_control_replay(struct sock *mon_sk) { struct sock *sk; read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *skb; skb = create_monitor_ctrl_open(sk); if (!skb) continue; if (sock_queue_rcv_skb(mon_sk, skb)) kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { struct hci_event_hdr *hdr; struct hci_ev_stack_internal *ev; struct sk_buff *skb; skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC); if (!skb) return; hdr = skb_put(skb, HCI_EVENT_HDR_SIZE); hdr->evt = HCI_EV_STACK_INTERNAL; hdr->plen = sizeof(*ev) + dlen; ev = skb_put(skb, sizeof(*ev) + dlen); ev->type = type; memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; __net_timestamp(skb); hci_skb_pkt_type(skb) = HCI_EVENT_PKT; hci_send_to_sock(hdev, skb); kfree_skb(skb); } void hci_sock_dev_event(struct hci_dev *hdev, int event) { BT_DBG("hdev %s event %d", hdev->name, event); if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (event <= HCI_DEV_DOWN) { struct hci_ev_si_device ev; /* Send event to sockets */ ev.event = event; ev.dev_id = hdev->id; hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); } if (event == HCI_DEV_UNREG) { struct sock *sk; /* Wake up sockets using this dead device */ read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { if (hci_pi(sk)->hdev == hdev) { sk->sk_err = EPIPE; sk->sk_state_change(sk); } } read_unlock(&hci_sk_list.lock); } } static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; list_for_each_entry(c, &mgmt_chan_list, list) { if (c->channel == channel) return c; } return NULL; } static struct hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; mutex_lock(&mgmt_chan_list_lock); c = __hci_mgmt_chan_find(channel); mutex_unlock(&mgmt_chan_list_lock); return c; } int hci_mgmt_chan_register(struct hci_mgmt_chan *c) { if 
(c->channel < HCI_CHANNEL_CONTROL) return -EINVAL; mutex_lock(&mgmt_chan_list_lock); if (__hci_mgmt_chan_find(c->channel)) { mutex_unlock(&mgmt_chan_list_lock); return -EALREADY; } list_add_tail(&c->list, &mgmt_chan_list); mutex_unlock(&mgmt_chan_list_lock); return 0; } EXPORT_SYMBOL(hci_mgmt_chan_register); void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c) { mutex_lock(&mgmt_chan_list_lock); list_del(&c->list); mutex_unlock(&mgmt_chan_list_lock); } EXPORT_SYMBOL(hci_mgmt_chan_unregister); static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; struct sk_buff *skb; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_free_cookie(sk); break; } bt_sock_unlink(&hci_sk_list, sk); hdev = hci_pi(sk)->hdev; if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER && !hci_dev_test_flag(hdev, HCI_UNREGISTER)) { /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will * be released and the controller brought back down. * * The checking of HCI_AUTO_OFF is not needed in this * case since it will have been cleared already when * opening the user channel. * * Make sure to also check that we haven't already * unregistered since all the cleanup will have already * been complete and hdev will get released when we put * below. */ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); hci_dev_put(hdev); } sock_orphan(sk); release_sock(sk); sock_put(sk); return 0; } static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } /* Ioctls that require bound socket */ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { struct hci_dev *hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) return PTR_ERR(hdev); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; if (hdev->dev_type != HCI_PRIMARY) return -EOPNOTSUPP; switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) return -EPERM; return -EOPNOTSUPP; case HCIGETCONNINFO: return hci_get_conn_info(hdev, (void __user *)arg); case HCIGETAUTHINFO: return hci_get_auth_info(hdev, (void __user *)arg); case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; } static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void 
__user *)arg; struct sock *sk = sock->sk; int err; BT_DBG("cmd %x arg %lx", cmd, arg); /* Make sure the cmd is valid before doing anything */ switch (cmd) { case HCIGETDEVLIST: case HCIGETDEVINFO: case HCIGETCONNLIST: case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: case HCIINQUIRY: case HCISETRAW: case HCIGETCONNINFO: case HCIGETAUTHINFO: case HCIBLOCKADDR: case HCIUNBLOCKADDR: break; default: return -ENOIOCTLCMD; } lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } /* When calling an ioctl on an unbound raw socket, then ensure * that the monitor gets informed. Ensure that the resulting event * is only send once by checking if the cookie exists or not. The * socket cookie will be only ever generated once for the lifetime * of a given socket. */ if (hci_sock_gen_cookie(sk)) { struct sk_buff *skb; /* Perform careful checks before setting the HCI_SOCK_TRUSTED * flag. Make sure that not only the current task but also * the socket opener has the required capability, since * privileged programs can be tricked into making ioctl calls * on HCI sockets, and the socket should not be marked as * trusted simply because the ioctl caller is privileged. */ if (sk_capable(sk, CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } release_sock(sk); switch (cmd) { case HCIGETDEVLIST: return hci_get_dev_list(argp); case HCIGETDEVINFO: return hci_get_dev_info(argp); case HCIGETCONNLIST: return hci_get_conn_list(argp); case HCIDEVUP: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_open(arg); case HCIDEVDOWN: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_close(arg); case HCIDEVRESET: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset(arg); case HCIDEVRESTAT: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset_stat(arg); case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_cmd(cmd, argp); case HCIINQUIRY: return hci_inquiry(argp); } lock_sock(sk); err = hci_sock_bound_ioctl(sk, cmd, arg); done: release_sock(sk); return err; } #ifdef CONFIG_COMPAT static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { switch (cmd) { case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: return hci_sock_ioctl(sock, cmd, arg); } return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); if (!addr) return -EINVAL; memset(&haddr, 0, sizeof(haddr)); len = min_t(unsigned int, sizeof(haddr), addr_len); memcpy(&haddr, addr, len); if (haddr.hci_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow detaching from dead device and attaching to alive device, if * the caller wants to re-bind (instead of close) this socket in * response to hci_sock_dev_event(HCI_DEV_UNREG) notification. 
*/ hdev = hci_pi(sk)->hdev; if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) { hci_pi(sk)->hdev = NULL; sk->sk_state = BT_OPEN; hci_dev_put(hdev); } hdev = NULL; if (sk->sk_state == BT_BOUND) { err = -EALREADY; goto done; } switch (haddr.hci_channel) { case HCI_CHANNEL_RAW: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } atomic_inc(&hdev->promisc); } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } break; case HCI_CHANNEL_USER: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev == HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } if (test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && test_bit(HCI_UP, &hdev->flags))) { err = -EBUSY; hci_dev_put(hdev); goto done; } if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) { err = -EUSERS; hci_dev_put(hdev); goto done; } mgmt_index_removed(hdev); err = hci_dev_open(hdev->id); if (err) { if (err == -EALREADY) { /* In case the transport is already up and * running, clear the error here. * * This can happen when opening a user * channel and HCI_AUTO_OFF grace period * is still active. */ err = 0; } else { hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); hci_dev_put(hdev); goto done; } } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * this socket will transition from a raw socket into * a user channel socket. For a clean transition, send * the close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* The user channel is restricted to CAP_NET_ADMIN * capabilities and with that implicitly trusted. */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_RAW)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. 
*/ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); send_monitor_note(sk, "Bluetooth subsystem version %u.%u", BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; case HCI_CHANNEL_LOGGING: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; break; default: if (!hci_mgmt_chan_find(haddr.hci_channel)) { err = -EINVAL; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } /* Users with CAP_NET_ADMIN capabilities are allowed * access to all management commands and events. For * untrusted users the interface is restricted and * also only untrusted events are sent. */ if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->channel = haddr.hci_channel; /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They * however might be cleared later and then sending of these * events will be disabled, but that is then intentional. * * This also enables generic events that are safe to be * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been * assigned, this socket will transition from * a raw socket into a control socket. To * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } /* Default MTU to HCI_MAX_FRAME_SIZE if not set */ if (!hci_pi(sk)->mtu) hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE; sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; struct hci_dev *hdev; int err = 0; BT_DBG("sock %p sk %p", sock, sk); if (peer) return -EOPNOTSUPP; lock_sock(sk); hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto done; } haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; haddr->hci_channel= hci_pi(sk)->channel; err = sizeof(*haddr); done: release_sock(sk); return err; } static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { __u8 mask = hci_pi(sk)->cmsg_mask; if (mask & HCI_CMSG_DIR) { int incoming = bt_cb(skb)->incoming; put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); } if (mask & HCI_CMSG_TSTAMP) { #ifdef CONFIG_COMPAT struct old_timeval32 ctv; #endif struct __kernel_old_timeval tv; void *data; int len; skb_get_timestamp(skb, &tv); data = &tv; len = sizeof(tv); #ifdef CONFIG_COMPAT if (!COMPAT_USE_64BIT_TIME && (msg->msg_flags & 
MSG_CMSG_COMPAT)) { ctv.tv_sec = tv.tv_sec; ctv.tv_usec = tv.tv_usec; data = &ctv; len = sizeof(ctv); } #endif put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data); } } static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; unsigned int skblen; BT_DBG("sock %p, sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING) return -EOPNOTSUPP; if (sk->sk_state == BT_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: hci_sock_cmsg(sk, msg, skb); break; case HCI_CHANNEL_USER: case HCI_CHANNEL_MONITOR: sock_recv_timestamp(msg, sk, skb); break; default: if (hci_mgmt_chan_find(hci_pi(sk)->channel)) sock_recv_timestamp(msg, sk, skb); break; } memset(&scm, 0, sizeof(scm)); scm.creds = bt_cb(skb)->creds; skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; scm_recv(sock, msg, &scm, flags); return err ? : copied; } static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, struct sk_buff *skb) { u8 *cp; struct mgmt_hdr *hdr; u16 opcode, index, len; struct hci_dev *hdev = NULL; const struct hci_mgmt_handler *handler; bool var_len, no_hdev; int err; BT_DBG("got %d bytes", skb->len); if (skb->len < sizeof(*hdr)) return -EINVAL; hdr = (void *)skb->data; opcode = __le16_to_cpu(hdr->opcode); index = __le16_to_cpu(hdr->index); len = __le16_to_cpu(hdr->len); if (len != skb->len - sizeof(*hdr)) { err = -EINVAL; goto done; } if (chan->channel == HCI_CHANNEL_CONTROL) { struct sk_buff *cmd; /* Send event to monitor */ cmd = create_monitor_ctrl_command(sk, index, opcode, len, skb->data + sizeof(*hdr)); if (cmd) { hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd, HCI_SOCK_TRUSTED, NULL); kfree_skb(cmd); } } if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_UNKNOWN_COMMAND); goto done; } handler = &chan->handlers[opcode]; if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) && !(handler->flags & HCI_MGMT_UNTRUSTED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_PERMISSION_DENIED); goto done; } if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !(handler->flags & HCI_MGMT_UNCONFIGURED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); if (no_hdev != !hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); if ((var_len && len < handler->data_len) || (!var_len && len != handler->data_len)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_PARAMS); goto done; } if (hdev && chan->hdev_init) chan->hdev_init(sk, hdev); cp = skb->data + sizeof(*hdr); err = handler->func(sk, 
hdev, cp, len); if (err < 0) goto done; err = skb->len; done: if (hdev) hci_dev_put(hdev); return err; } static int hci_logging_frame(struct sock *sk, struct sk_buff *skb, unsigned int flags) { struct hci_mon_hdr *hdr; struct hci_dev *hdev; u16 index; int err; /* The logging frame consists at minimum of the standard header, * the priority byte, the ident length byte and at least one string * terminator NUL byte. Anything shorter are invalid packets. */ if (skb->len < sizeof(*hdr) + 3) return -EINVAL; hdr = (void *)skb->data; if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr)) return -EINVAL; if (__le16_to_cpu(hdr->opcode) == 0x0000) { __u8 priority = skb->data[sizeof(*hdr)]; __u8 ident_len = skb->data[sizeof(*hdr) + 1]; /* Only the priorities 0-7 are valid and with that any other * value results in an invalid packet. * * The priority byte is followed by an ident length byte and * the NUL terminated ident string. Check that the ident * length is not overflowing the packet and also that the * ident string itself is NUL terminated. In case the ident * length is zero, the length value actually doubles as NUL * terminator identifier. * * The message follows the ident string (if present) and * must be NUL terminated. Otherwise it is not a valid packet. */ if (priority > 7 || skb->data[skb->len - 1] != 0x00 || ident_len > skb->len - sizeof(*hdr) - 3 || skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) return -EINVAL; } else { return -EINVAL; } index = __le16_to_cpu(hdr->index); if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) return -ENODEV; } else { hdev = NULL; } hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); err = skb->len; if (hdev) hci_dev_put(hdev); return err; } static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct hci_mgmt_chan *chan; struct hci_dev *hdev; struct sk_buff *skb; int err; const unsigned int flags = msg->msg_flags; BT_DBG("sock %p sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (len < 4 || len > hci_pi(sk)->mtu) return -EINVAL; skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: break; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; goto drop; case HCI_CHANNEL_LOGGING: err = hci_logging_frame(sk, skb, flags); goto drop; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); if (chan) err = hci_mgmt_cmd(chan, sk, skb); else err = -EINVAL; mutex_unlock(&mgmt_chan_list_lock); goto drop; } hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto drop; } if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto drop; } hci_skb_pkt_type(skb) = skb->data[0]; skb_pull(skb, 1); if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { /* No permission check is needed for user channel * since that gets enforced when binding the socket. * * However check that the packet type is valid. 
*/ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) { u16 opcode = get_unaligned_le16(skb->data); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); if (((ogf > HCI_SFLT_MAX_OGF) || !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) && !capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } /* Since the opcode has already been extracted here, store * a copy of the value for later use by the drivers. */ hci_skb_opcode(skb) = opcode; if (ogf == 0x3f) { skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else { /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } } else { if (!capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } err = len; done: release_sock(sk); return err; drop: kfree_skb(skb); goto done; } static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; int err = 0, opt = 0; BT_DBG("sk %p, opt %d", sk, optname); lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR; break; case HCI_TIME_STAMP: if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } len = min_t(unsigned int, len, sizeof(uf)); if (copy_from_sockptr(&uf, optval, len)) { err = -EFAULT; break; } if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0); uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1); } { struct hci_filter *f = &hci_pi(sk)->filter; f->type_mask = uf.type_mask; f->opcode = uf.opcode; *((u32 *) f->event_mask + 0) = uf.event_mask[0]; *((u32 *) f->event_mask + 1) = uf.event_mask[1]; } break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int len) { struct sock *sk = sock->sk; int err = 0; u16 opt; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_setsockopt_old(sock, level, optname, optval, len); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: switch (hci_pi(sk)->channel) { /* Don't allow changing MTU for channels that are meant for HCI * traffic only. 
*/ case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: err = -ENOPROTOOPT; goto done; } if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } hci_pi(sk)->mtu = opt; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct hci_ufilter uf; struct sock *sk = sock->sk; int len, opt, err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_TIME_STAMP: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } len = min_t(unsigned int, len, sizeof(uf)); if (copy_to_user(optval, &uf, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_getsockopt_old(sock, level, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static void hci_sock_destruct(struct sock *sk) { mgmt_cleanup(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, .bind = hci_sock_bind, .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hci_sock_compat_ioctl, #endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, .getsockopt = hci_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto hci_sk_proto = { .name = "HCI", .owner = THIS_MODULE, .obj_size = sizeof(struct hci_pinfo) }; static int hci_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sock->ops = &hci_sock_ops; sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->state = SS_UNCONNECTED; sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); return 0; } static const struct net_proto_family hci_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = hci_sock_create, }; int __init hci_sock_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_hci) > sizeof(struct sockaddr)); err = proto_register(&hci_sk_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops); if (err < 0) { 
		BT_ERR("HCI socket registration failed");
		goto error;
	}

	err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL);
	if (err < 0) {
		BT_ERR("Failed to create HCI proc file");
		bt_sock_unregister(BTPROTO_HCI);
		goto error;
	}

	BT_INFO("HCI socket layer initialized");

	return 0;

error:
	proto_unregister(&hci_sk_proto);
	return err;
}

void hci_sock_cleanup(void)
{
	bt_procfs_cleanup(&init_net, "hci");
	bt_sock_unregister(BTPROTO_HCI);
	proto_unregister(&hci_sk_proto);
}
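/*
 * Illustrative sketch, not part of the kernel sources above: how the bind
 * path implemented by hci_sock_bind() is typically exercised from userspace
 * to open a monitor channel (similar to what btmon does). The constant
 * values and the sockaddr layout below mirror the Bluetooth UAPI headers
 * and are restated here as assumptions, not taken from this file.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

#ifndef AF_BLUETOOTH
#define AF_BLUETOOTH		31
#endif
#define BTPROTO_HCI		1
#define HCI_DEV_NONE		0xffff
#define HCI_CHANNEL_MONITOR	2

/* Mirrors struct sockaddr_hci from the Bluetooth UAPI (assumed layout). */
struct sockaddr_hci_user {
	sa_family_t	hci_family;
	unsigned short	hci_dev;
	unsigned short	hci_channel;
};

static int open_hci_monitor(void)
{
	struct sockaddr_hci_user addr;
	int fd;

	/* SOCK_RAW + BTPROTO_HCI ends up in hci_sock_create() above */
	fd = socket(AF_BLUETOOTH, SOCK_RAW | SOCK_CLOEXEC, BTPROTO_HCI);
	if (fd < 0)
		return -1;

	memset(&addr, 0, sizeof(addr));
	addr.hci_family = AF_BLUETOOTH;
	addr.hci_dev = HCI_DEV_NONE;		/* monitor is not bound to one device */
	addr.hci_channel = HCI_CHANNEL_MONITOR;	/* requires CAP_NET_RAW */

	/* Triggers the HCI_CHANNEL_MONITOR case of hci_sock_bind() */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}

	/* Replayed index/open events can now be read with recvmsg() */
	return fd;
}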
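/*
 * Second illustrative sketch, also not part of the sources above: the
 * payload layout that hci_logging_frame() accepts on HCI_CHANNEL_LOGGING.
 * Only the layout rules (opcode 0x0000, priority 0-7, ident length that
 * includes its NUL, NUL-terminated message) come from the validation code
 * above; the struct and helper names here are hypothetical, and the header
 * fields are little-endian on the wire (host assumed little-endian for
 * brevity).
 */
#include <stdint.h>
#include <string.h>

struct mon_hdr {		/* mirrors struct hci_mon_hdr (6 bytes) */
	uint16_t opcode;	/* must be 0x0000 on the way in */
	uint16_t index;		/* device index, or 0xffff for none */
	uint16_t len;		/* payload length after the header */
} __attribute__((packed));

/* Build "<hdr> <priority> <ident_len> <ident\0> <message\0>" into buf. */
static size_t build_logging_frame(uint8_t *buf, size_t size, uint8_t priority,
				  const char *ident, const char *msg)
{
	size_t ident_len = strlen(ident) + 1;	/* ident_len counts its NUL */
	size_t msg_len = strlen(msg) + 1;	/* message is NUL terminated too */
	size_t total = sizeof(struct mon_hdr) + 2 + ident_len + msg_len;
	struct mon_hdr *hdr = (struct mon_hdr *)buf;
	uint8_t *p = buf + sizeof(*hdr);

	if (priority > 7 || ident_len > 0xff || total > size)
		return 0;

	hdr->opcode = 0x0000;			/* rewritten to HCI_MON_USER_LOGGING */
	hdr->index = 0xffff;			/* MGMT_INDEX_NONE */
	hdr->len = (uint16_t)(total - sizeof(*hdr));

	*p++ = priority;
	*p++ = (uint8_t)ident_len;
	memcpy(p, ident, ident_len);
	p += ident_len;
	memcpy(p, msg, msg_len);

	/* Pass to send() on a socket bound to HCI_CHANNEL_LOGGING */
	return total;
}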
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ICMPV6_H
#define _LINUX_ICMPV6_H

#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <uapi/linux/icmpv6.h>

static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
{
	return (struct icmp6hdr *)skb_transport_header(skb);
}

#include <linux/netdevice.h>

#if IS_ENABLED(CONFIG_IPV6)

typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
			     const struct in6_addr *force_saddr,
			     const struct inet6_skb_parm *parm);
void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
		const struct in6_addr *force_saddr,
		const struct inet6_skb_parm *parm);
#if IS_BUILTIN(CONFIG_IPV6)
static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code,
				 __u32 info, const struct inet6_skb_parm *parm)
{
	icmp6_send(skb, type, code, info, NULL, parm);
}
static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
{
	BUILD_BUG_ON(fn != icmp6_send);
	return 0;
}
static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
{
	BUILD_BUG_ON(fn != icmp6_send);
	return 0;
}
#else
extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
			  const struct inet6_skb_parm *parm);
extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
#endif

static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
	__icmpv6_send(skb, type, code, info, IP6CB(skb));
}

int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
			       unsigned int data_len);

#if IS_ENABLED(CONFIG_NF_NAT)
void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
#else
static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code,
				   __u32 info)
{
	struct inet6_skb_parm parm = { 0 };

	__icmpv6_send(skb_in, type, code, info, &parm);
}
#endif

#else

static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
}

static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code,
				   __u32 info)
{
}
#endif

extern int	icmpv6_init(void);
extern int	icmpv6_err_convert(u8 type, u8 code, int *err);
extern void	icmpv6_cleanup(void);
extern void	icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
					 enum skb_drop_reason reason);

struct flowi6;
struct in6_addr;

void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
		      const struct in6_addr *saddr,
		      const struct in6_addr *daddr, int oif);

static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
{
	icmpv6_param_prob_reason(skb, code, pos,
				 SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline bool icmpv6_is_err(int type)
{
	switch (type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
	case ICMPV6_PARAMPROB:
		return true;
	}

	return false;
}

#endif
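/*
 * Illustrative sketch, not part of the header above: how the helpers it
 * declares are commonly used by callers. The function names, the mtu/skb
 * parameters and the surrounding context are hypothetical; only the
 * icmpv6_send() and icmpv6_is_err() calls come from the header.
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/icmpv6.h>

/* Reject an oversized packet by reporting the path MTU back to the sender. */
static int example_check_mtu(struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len > mtu) {
		/* Expands to __icmpv6_send(..., IP6CB(skb)) when IPv6 is built in */
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		return -EMSGSIZE;
	}

	return 0;
}

/* Decide whether a received ICMPv6 type reports an error condition. */
static bool example_is_error_report(const struct icmp6hdr *hdr)
{
	return icmpv6_is_err(hdr->icmp6_type);
}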
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2021-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct.
Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->deflink.status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. 
*/ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } if (ieee80211_is_action(mgmt->frame_control) && !ieee80211_has_protected(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT && mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS && ieee80211_sdata_running(sdata)) { enum ieee80211_smps_mode smps_mode; switch (mgmt->u.action.u.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DYNAMIC: smps_mode = IEEE80211_SMPS_DYNAMIC; break; case WLAN_HT_SMPS_CONTROL_STATIC: smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_SMPS_CONTROL_DISABLED: default: /* shouldn't happen since we don't send that */ smps_mode = IEEE80211_SMPS_OFF; break; } if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* * This update looks racy, but isn't -- if we come * here we've definitely got a station that we're * talking to, and on a managed interface that can * only be the AP. And the only other place updating * this variable in managed mode is before association. 
*/ sdata->deflink.smps_mode = smps_mode; ieee80211_queue_work(&local->hw, &sdata->recalc_smps); } } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { struct ieee80211_rate_status *status_rate = NULL; int len = sizeof(struct ieee80211_radiotap_header); if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; /* IEEE80211_RADIOTAP_RATE rate */ if (status_rate && !(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status_rate) { if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, int rtap_len, int shift, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; struct ieee80211_rate_status *status_rate = NULL; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. 
*/ /* IEEE80211_RADIOTAP_RATE */ if (status_rate) { if (!(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status_rate->rate_idx.legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[info->band]; legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; } if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status_rate->rate_idx.bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status_rate->rate_idx.mcs; pos += 3; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status_rate->rate_idx.mcs << 4) | status_rate->rate_idx.nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status_rate->rate_idx.nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); 
CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status_rate->rate_idx.mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status_rate->rate_idx.he_dcm); he->data5 |= HE_PREP(DATA5_GI, status_rate->rate_idx.he_gi); switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status_rate->rate_idx.he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status_rate->rate_idx.bw); } pos += sizeof(struct ieee80211_radiotap_he); } if (status_rate || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. 
* If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct sk_buff *orig_skb, bool acked, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(orig_skb); struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->ack_frame_id); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; bool is_valid_ack_signal = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); struct cfg80211_tx_status status = { .cookie = cookie, .buf = skb->data, .len = skb->len, .ack = acked, }; if (ieee80211_is_timing_measurement(orig_skb) || ieee80211_is_ftm(orig_skb)) { status.tx_tstamp = ktime_to_ns(skb_hwtstamps(orig_skb)->hwtstamp); status.ack_tstamp = ktime_to_ns(ack_hwtstamp); } rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status_ext(&sdata->wdev, &status, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = 
false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else if (!dropped) { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb, hdr_size) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); wiphy_work_queue(local->hw.wiphy, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->ack_frame_id) { ieee80211_report_ack_skb(local, skb, acked, dropped, ack_hwtstamp); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->deflink.status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. 
* For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->deflink.status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->deflink.status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->deflink.status_stats.lost_packets, GFP_ATOMIC); sta->deflink.status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, int shift, bool send_to_cooked, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, skb, retry_count, rtap_len, shift, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); shift = ieee80211_vif_get_shift(&sta->sdata->vif); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != 
-1)) sta->deflink.tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->deflink.status_stats.msdu_failed[tid]++; sta->deflink.status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); /* this was a transmitted frame, but now we want to reuse it */ skb_orphan(skb); /* Need to make a copy before skb->cb gets cleared */ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || !(ieee80211_is_data(fc)); /* * This is a bit racy but we can avoid a lot of work * with this test... 
*/ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); return; } /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, retry_count, shift, send_to_cooked, status); } void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success, ack_signal_valid; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->n_rates) sta->deflink.tx_stats.last_rate_info = status->rates[status->n_rates - 1].rate_idx; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); ack_signal_valid = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->deflink.status_stats.retry_failed++; sta->deflink.status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->deflink.status_stats.last_ack = jiffies; if (sta->deflink.status_stats.lost_packets) sta->deflink.status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->deflink.status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (ack_signal_valid) { sta->deflink.status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->deflink.status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->deflink.status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. 
*/ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->deflink.tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ktime_t kt = ktime_set(0, 0); ieee80211_report_used_skb(local, skb, true, kt); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); } |
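/*
 * Illustrative sketch (not part of mac80211): how a low-level driver's TX
 * completion path is typically expected to feed the status machinery above.
 * The driver records the ACK outcome and retry information in the skb's
 * ieee80211_tx_info and then hands the frame back via ieee80211_tx_status()
 * (or ieee80211_tx_status_ext() when it has richer rate data). The driver
 * function name and the "hw_retries" value read from hardware are
 * assumptions made only for this example.
 */
#if 0	/* example only, never compiled */
static void my_driver_tx_done(struct ieee80211_hw *hw, struct sk_buff *skb,
                              bool acked, u8 hw_retries)
{
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);

        ieee80211_tx_info_clear_status(info);
        if (acked)
                info->flags |= IEEE80211_TX_STAT_ACK;
        info->status.rates[0].count = hw_retries + 1;

        /* mac80211 consumes the skb, updates station stats and radiotap */
        ieee80211_tx_status(hw, skb);
}
#endif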
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H

#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>

/* The 64-bit atomic type */

#define ATOMIC64_INIT(i)	{ (i) }

static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
	return __READ_ONCE((v)->counter);
}

static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
	__WRITE_ONCE(v->counter, i);
}

static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "addq %1,%0"
		     : "=m" (v->counter)
		     : "er" (i), "m" (v->counter) : "memory");
}

static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "subq %1,%0"
		     : "=m" (v->counter)
		     : "er" (i), "m" (v->counter) : "memory");
}

static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
	return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test

static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "incq %0"
		     : "=m" (v->counter)
		     : "m" (v->counter) : "memory");
}
#define arch_atomic64_inc arch_atomic64_inc

static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "decq %0"
		     : "=m" (v->counter)
		     : "m" (v->counter) : "memory");
}
#define arch_atomic64_dec arch_atomic64_dec

static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
	return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test

static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
	return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test

static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
	return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
#define arch_atomic64_add_negative arch_atomic64_add_negative

static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
	return i + xadd(&v->counter, i);
}
#define arch_atomic64_add_return arch_atomic64_add_return

static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
	return arch_atomic64_add_return(-i, v);
}
#define arch_atomic64_sub_return arch_atomic64_sub_return

static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
	return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
	return xadd(&v->counter, -i);
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub

static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
	return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
	return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg

static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
	return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg

static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "andq %1,%0"
		     : "+m" (v->counter)
		     : "er" (i)
		     : "memory");
}

static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
	s64 val = arch_atomic64_read(v);

	do {
	} while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
	return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "orq %1,%0"
		     : "+m" (v->counter)
		     : "er" (i)
		     : "memory");
}

static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
	s64 val = arch_atomic64_read(v);

	do {
	} while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
	return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
	asm volatile(LOCK_PREFIX "xorq %1,%0"
		     : "+m" (v->counter)
		     : "er" (i)
		     : "memory");
}

static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
	s64 val = arch_atomic64_read(v);

	do {
	} while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
	return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

#endif /* _ASM_X86_ATOMIC64_64_H */
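/*
 * Illustrative sketch (not part of this header): the arch_atomic64_*()
 * helpers above are normally reached through the generic atomic64_*()
 * wrappers from <linux/atomic.h>. The try_cmpxchg loop below mirrors the
 * pattern used by arch_atomic64_fetch_and/or/xor() for a caller-defined
 * update; the counter name, the saturation policy and the function name are
 * assumptions made only for this example.
 */
#if 0	/* example only, never compiled */
static atomic64_t example_counter = ATOMIC64_INIT(0);

static s64 example_saturating_add(s64 delta, s64 limit)
{
        s64 old = atomic64_read(&example_counter);
        s64 new;

        do {
                /* on failure, try_cmpxchg reloads 'old' with current value */
                new = min(old + delta, limit);
        } while (!atomic64_try_cmpxchg(&example_counter, &old, new));

        return new;
}
#endif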
3726 99 121 239 114 1 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> struct vm_area_struct; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". 
* * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. 
*/ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array); unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array); /* Bulk allocate order-0 pages */ static inline unsigned long alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) { return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); } static inline unsigned long alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } static inline unsigned long alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); } static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages(gfp_mask, order, nid, NULL); } static inline struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc(gfp, order, nid, NULL); } /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. 
*/ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node(nid, gfp_mask, order); } #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } #define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ folio_alloc(gfp, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); return &folio->page; } extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask); static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); } extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); static inline bool gfp_has_io_fs(gfp_t gfp) { return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif /* __LINUX_GFP_H */ |
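/*
 * Illustrative sketch (not part of this header): typical use of the
 * allocator entry points declared above. GFP_KERNEL implies
 * __GFP_DIRECT_RECLAIM, so gfpflags_allow_blocking() is true and these
 * calls may sleep; with no zone modifier bits set, gfp_zone() resolves to
 * ZONE_NORMAL. The order-1 size and the function name are assumptions made
 * only for this example.
 */
#if 0	/* example only, never compiled */
static int example_alloc(void)
{
        struct page *page;
        unsigned long buf;

        /* one page from the buddy allocator, may block and reclaim */
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        /* two contiguous, zeroed pages returned as a kernel virtual address */
        buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
        if (!buf) {
                __free_page(page);
                return -ENOMEM;
        }

        free_pages(buf, 1);
        __free_page(page);
        return 0;
}
#endif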
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Synchronous Cryptographic Hash operations.
* * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" #define MAX_SHASH_ALIGNMASK 63 static const struct crypto_type crypto_shash_type; static inline struct crypto_istat_hash *shash_get_stat(struct shash_alg *alg) { return hash_get_stat(&alg->halg); } static inline int crypto_shash_errstat(struct shash_alg *alg, int err) { return crypto_hash_errstat(&alg->halg, err); } int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); unsigned long absize; u8 *buffer, *alignbuffer; int err; absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); err = shash->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return err; } static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if ((unsigned long)key & alignmask) err = shash_setkey_unaligned(tfm, key, keylen); else err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); static int shash_update_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); unsigned int unaligned_len = alignmask + 1 - ((unsigned long)data & alignmask); /* * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ u8 ubuf[MAX_SHASH_ALIGNMASK * 2]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; if (WARN_ON(buf + unaligned_len > ubuf + sizeof(ubuf))) return -EINVAL; if (unaligned_len > len) unaligned_len = len; memcpy(buf, data, unaligned_len); err = shash->update(desc, buf, unaligned_len); memset(buf, 0, unaligned_len); return err ?: shash->update(desc, data + unaligned_len, len - unaligned_len); } int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_add(len, &shash_get_stat(shash)->hash_tlen); if ((unsigned long)data & alignmask) err = shash_update_unaligned(desc, data, len); else err = shash->update(desc, data, len); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_update); static int shash_final_unaligned(struct shash_desc *desc, u8 *out) { struct crypto_shash *tfm = desc->tfm; unsigned long alignmask = crypto_shash_alignmask(tfm); 
struct shash_alg *shash = crypto_shash_alg(tfm); unsigned int ds = crypto_shash_digestsize(tfm); /* * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ u8 ubuf[MAX_SHASH_ALIGNMASK + HASH_MAX_DIGESTSIZE]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; if (WARN_ON(buf + ds > ubuf + sizeof(ubuf))) return -EINVAL; err = shash->final(desc, buf); if (err) goto out; memcpy(out, buf, ds); out: memset(buf, 0, ds); return err; } int crypto_shash_final(struct shash_desc *desc, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_inc(&shash_get_stat(shash)->hash_cnt); if ((unsigned long)out & alignmask) err = shash_final_unaligned(desc, out); else err = shash->final(desc, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_final); static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return shash_update_unaligned(desc, data, len) ?: shash_final_unaligned(desc, out); } int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } if (((unsigned long)data | (unsigned long)out) & alignmask) err = shash_finup_unaligned(desc, data, len, out); else err = shash->finup(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return crypto_shash_init(desc) ?: shash_update_unaligned(desc, data, len) ?: shash_final_unaligned(desc, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) err = -ENOKEY; else if (((unsigned long)data | (unsigned long)out) & alignmask) err = shash_digest_unaligned(desc, data, len, out); else err = shash->digest(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, tfm); int err; desc->tfm = tfm; err = crypto_shash_digest(desc, data, len, out); shash_desc_zero(desc); return err; } EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest); static int shash_default_export(struct shash_desc *desc, void *out) { memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm)); return 0; } static int shash_default_import(struct shash_desc *desc, const void *in) { memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(desc->tfm)); return 0; } static int shash_async_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { struct crypto_shash **ctx = crypto_ahash_ctx(tfm); return crypto_shash_setkey(*ctx, key, keylen); } 
static int shash_async_init(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return crypto_shash_init(desc); } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0; nbytes = crypto_hash_walk_done(&walk, nbytes)) nbytes = crypto_shash_update(desc, walk.data, nbytes); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_update); static int shash_async_update(struct ahash_request *req) { return shash_ahash_update(req, ahash_request_ctx(req)); } static int shash_async_final(struct ahash_request *req) { return crypto_shash_final(ahash_request_ctx(req), req->result); } int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; nbytes = crypto_hash_walk_first(req, &walk); if (!nbytes) return crypto_shash_final(desc, req->result); do { nbytes = crypto_hash_walk_last(&walk) ? crypto_shash_finup(desc, walk.data, nbytes, req->result) : crypto_shash_update(desc, walk.data, nbytes); nbytes = crypto_hash_walk_done(&walk, nbytes); } while (nbytes > 0); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_finup); static int shash_async_finup(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return shash_ahash_finup(req, desc); } int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { unsigned int nbytes = req->nbytes; struct scatterlist *sg; unsigned int offset; int err; if (nbytes && (sg = req->src, offset = sg->offset, nbytes <= min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; data = kmap_local_page(sg_page(sg)); err = crypto_shash_digest(desc, data + offset, nbytes, req->result); kunmap_local(data); } else err = crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); return err; } EXPORT_SYMBOL_GPL(shash_ahash_digest); static int shash_async_digest(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return shash_ahash_digest(req, desc); } static int shash_async_export(struct ahash_request *req, void *out) { return crypto_shash_export(ahash_request_ctx(req), out); } static int shash_async_import(struct ahash_request *req, const void *in) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return crypto_shash_import(desc, in); } static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm) { struct crypto_shash **ctx = crypto_tfm_ctx(tfm); crypto_free_shash(*ctx); } int crypto_init_shash_ops_async(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct shash_alg *alg = __crypto_shash_alg(calg); struct crypto_ahash *crt = __crypto_ahash_cast(tfm); struct crypto_shash **ctx = crypto_tfm_ctx(tfm); struct crypto_shash *shash; if (!crypto_mod_get(calg)) return -EAGAIN; shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); } *ctx = shash; tfm->exit = crypto_exit_shash_ops_async; crt->init = shash_async_init; crt->update = shash_async_update; crt->final = shash_async_final; crt->finup = shash_async_finup; crt->digest = shash_async_digest; if (crypto_shash_alg_has_setkey(alg)) 
crt->setkey = shash_async_setkey; crypto_ahash_set_flags(crt, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); crt->export = shash_async_export; crt->import = shash_async_import; crt->reqsize = sizeof(struct shash_desc) + crypto_shash_descsize(shash); return 0; } struct crypto_ahash *crypto_clone_shash_ops_async(struct crypto_ahash *nhash, struct crypto_ahash *hash) { struct crypto_shash **nctx = crypto_ahash_ctx(nhash); struct crypto_shash **ctx = crypto_ahash_ctx(hash); struct crypto_shash *shash; shash = crypto_clone_shash(*ctx); if (IS_ERR(shash)) { crypto_free_ahash(nhash); return ERR_CAST(shash); } *nctx = shash; return nhash; } static void crypto_shash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); alg->exit_tfm(hash); } static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); int err; hash->descsize = alg->descsize; shash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_shash_exit_tfm; if (!alg->init_tfm) return 0; err = alg->init_tfm(hash); if (err) return err; /* ->init_tfm() may have increased the descsize. */ if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) { if (alg->exit_tfm) alg->exit_tfm(hash); return -EINVAL; } return 0; } static void crypto_shash_free_instance(struct crypto_instance *inst) { struct shash_instance *shash = shash_instance(inst); shash->free(shash); } static int __maybe_unused crypto_shash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) { struct shash_alg *salg = __crypto_shash_alg(alg); seq_printf(m, "type : shash\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", salg->digestsize); } static int __maybe_unused crypto_shash_report_stat( struct sk_buff *skb, struct crypto_alg *alg) { return crypto_hash_report_stat(skb, alg, "shash"); } static const struct crypto_type crypto_shash_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_shash_init_tfm, .free = crypto_shash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_shash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_shash_report, #endif #ifdef CONFIG_CRYPTO_STATS .report_stat = crypto_shash_report_stat, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SHASH, .tfmsize = offsetof(struct crypto_shash, base), }; int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_shash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_shash); struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); int crypto_has_shash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); } 
EXPORT_SYMBOL_GPL(crypto_has_shash); struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash) { struct crypto_tfm *tfm = crypto_shash_tfm(hash); struct shash_alg *alg = crypto_shash_alg(hash); struct crypto_shash *nhash; int err; if (!crypto_shash_alg_has_setkey(alg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init)) return ERR_PTR(-ENOSYS); nhash = crypto_clone_tfm(&crypto_shash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->descsize = hash->descsize; if (alg->clone_tfm) { err = alg->clone_tfm(nhash, hash); if (err) { crypto_free_shash(nhash); return ERR_PTR(err); } } return nhash; } EXPORT_SYMBOL_GPL(crypto_clone_shash); int hash_prepare_alg(struct hash_alg_common *alg) { struct crypto_istat_hash *istat = hash_get_stat(alg); struct crypto_alg *base = &alg->base; if (alg->digestsize > HASH_MAX_DIGESTSIZE) return -EINVAL; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) memset(istat, 0, sizeof(*istat)); return 0; } static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->descsize > HASH_MAX_DESCSIZE) return -EINVAL; if (base->cra_alignmask > MAX_SHASH_ALIGNMASK) return -EINVAL; if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_shash_type; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; if (!alg->finup) alg->finup = shash_finup_unaligned; if (!alg->digest) alg->digest = shash_digest_unaligned; if (!alg->export) { alg->export = shash_default_export; alg->import = shash_default_import; alg->halg.statesize = alg->descsize; } if (!alg->setkey) alg->setkey = shash_no_setkey; return 0; } int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = shash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_shash); void crypto_unregister_shash(struct shash_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_shash); int crypto_register_shashes(struct shash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_shash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_shash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_shashes); void crypto_unregister_shashes(struct shash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_shash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_shashes); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = shash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, shash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(shash_register_instance); void shash_free_singlespawn_instance(struct shash_instance *inst) { crypto_drop_spawn(shash_instance_ctx(inst)); kfree(inst); } EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous cryptographic hash type"); |
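/*
 * Illustrative sketch (not part of this file): a typical one-shot caller of
 * the shash API implemented above. crypto_alloc_shash() resolves the
 * algorithm through crypto_shash_type, and crypto_shash_tfm_digest() wraps
 * the init/update/final sequence on an on-stack descriptor. The "sha256"
 * algorithm name, the 32-byte output bound and the function name are
 * assumptions made only for this example.
 */
#if 0	/* example only, never compiled */
static int example_sha256(const u8 *data, unsigned int len, u8 *out32)
{
        struct crypto_shash *tfm;
        int err;

        tfm = crypto_alloc_shash("sha256", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        if (crypto_shash_digestsize(tfm) > 32) {
                crypto_free_shash(tfm);
                return -EINVAL;
        }

        err = crypto_shash_tfm_digest(tfm, data, len, out32);
        crypto_free_shash(tfm);
        return err;
}
#endif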
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #include <linux/nfc.h> #include <linux/module.h> #include "nfc.h" static DEFINE_RWLOCK(proto_tab_lock); static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX]; static int nfc_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int rc = -EPROTONOSUPPORT; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= NFC_SOCKPROTO_MAX) return -EINVAL; read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); return rc; } static const struct net_proto_family nfc_sock_family_ops = { .owner = THIS_MODULE, .family = PF_NFC, .create = nfc_sock_create, }; int nfc_proto_register(const struct nfc_protocol *nfc_proto) { int rc; if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX) return -EINVAL; rc = proto_register(nfc_proto->proto, 0); if (rc) return rc; write_lock(&proto_tab_lock); if (proto_tab[nfc_proto->id]) rc = -EBUSY; else proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); if (rc) proto_unregister(nfc_proto->proto); return rc; } EXPORT_SYMBOL(nfc_proto_register); void nfc_proto_unregister(const struct nfc_protocol *nfc_proto) { write_lock(&proto_tab_lock); proto_tab[nfc_proto->id] = NULL; write_unlock(&proto_tab_lock); proto_unregister(nfc_proto->proto); } EXPORT_SYMBOL(nfc_proto_unregister); int __init af_nfc_init(void) { return sock_register(&nfc_sock_family_ops); } void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); } |
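As a usage illustration (not part of af_nfc.c), a hypothetical socket-protocol module would plug into this dispatcher roughly as sketched below. The struct nfc_protocol fields used (id, proto, owner, create) are the ones af_nfc.c dereferences, struct nfc_protocol itself comes from the subsystem-private "nfc.h" header included above, and every "example_*" identifier is invented for the sketch.

#include <linux/module.h>
#include <linux/nfc.h>
#include <net/sock.h>
#include "nfc.h"

static struct proto example_proto = {
	.name     = "NFC_EXAMPLE",
	.owner    = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

/* Called by nfc_sock_create() for socket(AF_NFC, ..., <id>) requests */
static int example_sock_create(struct net *net, struct socket *sock,
			       const struct nfc_protocol *nfc_proto, int kern)
{
	/* A real implementation would sk_alloc() a socket and set sock->ops */
	return -EOPNOTSUPP;
}

static const struct nfc_protocol example_nfc_proto = {
	.id     = NFC_SOCKPROTO_RAW,	/* must be below NFC_SOCKPROTO_MAX */
	.proto  = &example_proto,
	.owner  = THIS_MODULE,
	.create = example_sock_create,
};

static int __init example_init(void)
{
	return nfc_proto_register(&example_nfc_proto);
}

static void __exit example_exit(void)
{
	nfc_proto_unregister(&example_nfc_proto);
}

module_init(example_init);
module_exit(example_exit);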
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGALLOC_TRACK_H #define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pgd_none(*pgd))) { if (__p4d_alloc(mm, pgd, address)) return NULL; *mod_mask |= PGTBL_PGD_MODIFIED; } return p4d_offset(pgd, address); } static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(p4d_none(*p4d))) { if (__pud_alloc(mm, p4d, address)) return NULL; *mod_mask |= PGTBL_P4D_MODIFIED; } return pud_offset(p4d, address); } static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pud_none(*pud))) { if (__pmd_alloc(mm, pud, address)) return NULL; *mod_mask |= PGTBL_PUD_MODIFIED; } return pmd_offset(pud, address); } #endif /* CONFIG_MMU */ #define pte_alloc_kernel_track(pmd, address, mask) \ ((unlikely(pmd_none(*(pmd))) && \ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) #endif /* _LINUX_PGALLOC_TRACK_H */ |
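These helpers are meant to be chained while building a mapping, so the caller accumulates which page-table levels it modified and can sync the kernel page tables afterwards if the architecture requires it. A hedged sketch of that pattern (the helper name is invented; it assumes a kernel-address walk via pgd_offset_k()/init_mm and that this header plus <linux/mm.h> are in scope):

static pte_t *example_walk_alloc_kernel(unsigned long addr,
					pgtbl_mod_mask *mask)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return NULL;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return NULL;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return NULL;

	/* ORs PGTBL_PMD_MODIFIED into *mask if it allocates the PTE page */
	return pte_alloc_kernel_track(pmd, addr, mask);
}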
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table.
* Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. 
*/ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. 
*/ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return 
call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? 
cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; if (cb->strict_check) { int err; err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } if (!cb->args[0]) { res = fib6_dump_table(tb, skb, cb); if (!res) cb->args[0] = 1; } goto out; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) goto out_unlock; next: e++; } } out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); return res; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. 
*/ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. 
* RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. 
*/ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires_locked(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires_locked(iter); else fib6_set_expires_locked(iter, rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! 
*/ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct 
fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (fib6_has_expires(rt)) hlist_add_head(&rt->gc_link, &table->tb6_gc_hlist); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. 
*/ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = info->nl_net->ipv6.fib6_null_entry; } #endif fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. 
*/ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. 
*/ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; RT6_TRACE("fib6_del_route\n"); /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. 
*/ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. 
* positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. 
* 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (fib6_has_expires(rt) && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? 
(int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: 
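/* undo the earlier kmem_cache_create() before returning the error */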
kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; 
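/* begin a new walk; entries already returned are skipped via iter->skip in ipv6_route_yield() */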
ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */ |
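/*
 * Illustrative sketch, not part of the original file: it shows how a caller
 * could use the fib6_clean_all() callback contract documented above (return
 * -1 to have fib6_clean_node() delete the route, 0 to keep walking). The
 * function names and the use of fib6_metric as the match key are assumptions
 * made only for this example.
 */
static int example_flush_by_metric(struct fib6_info *rt, void *arg)
{
	u32 *metric = arg;

	/* -1 asks fib6_clean_node() to fib6_del() this route */
	if (rt->fib6_metric == *metric)
		return -1;

	/* 0 keeps the route and continues the walk */
	return 0;
}

static void example_flush(struct net *net, u32 metric)
{
	/* walks every table, taking tb6_lock per table as __fib6_clean_all() does */
	fib6_clean_all(net, example_flush_by_metric, &metric);
}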
/* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time.
The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* * Inode orphaned in orphan file or in orphan list? */ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. 
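* An inode is considered already linked on disk only when NEXT_ORPHAN() holds a non-zero value inside the valid inode-number range checked below.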
*/ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. 
*/ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. 
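* Entries recorded in the orphan file (rather than in this legacy list) are handled separately below by scanning every orphan block for non-zero inode numbers.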
*/ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, 
oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; }
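/*
 * Illustrative sketch, not part of the original file: ext4_orphan_file_add()
 * stores the claimed slot as i_orphan_idx = block * inodes_per_ob + offset,
 * and ext4_orphan_file_del() recovers the pair with a division and a modulo.
 * The small standalone program below models only that encoding and the
 * circular search start derived from the CPU number; the constants and the
 * plain helpers are assumptions made for this example.
 */
#include <stdio.h>

#define INODES_PER_OB 255	/* assumed number of entries per orphan block */
#define OF_BLOCKS 4		/* assumed number of orphan file blocks */

static int encode_idx(int block, int offset)
{
	/* mirrors: i_orphan_idx = i * inodes_per_ob + j */
	return block * INODES_PER_OB + offset;
}

static void decode_idx(int idx, int *block, int *offset)
{
	/* mirrors: blk = idx / inodes_per_ob; off = idx % inodes_per_ob */
	*block = idx / INODES_PER_OB;
	*offset = idx % INODES_PER_OB;
}

static int pick_start_block(int cpu)
{
	/* mirrors the naive hash: raw_smp_processor_id() * 13 % of_blocks */
	return cpu * 13 % OF_BLOCKS;
}

int main(void)
{
	int blk, off;
	int idx = encode_idx(2, 17);

	decode_idx(idx, &blk, &off);
	printf("idx=%d -> block=%d offset=%d, search start for cpu 3 = %d\n",
	       idx, blk, off, pick_start_block(3));
	return 0;
}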
// SPDX-License-Identifier: GPL-2.0-or-later /* * IP multicast routing support for mrouted 3.6/3.8 * * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code * Alan Cox : Fixed the clone/copy bug and device race. * Mike McLagan : Routing by source * Malcolm Beattie : Buffer handling fixes. * Alexey Kuznetsov : Double buffer free and other fixes. * SVR Anand : Fixed several multicast bugs and problems. * Alexey Kuznetsov : Status, optimisations and more. * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header * Relax this requirement to work with older peers. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/cache.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/route.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> #include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/rtnh.h> #include <linux/nospec.h> struct ipmr_rule { struct fib_rule common; }; struct ipmr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. * Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved * entries is changed only in process context and protected * with weak lock mrt_lock. Queue of unresolved entries is protected * with strong spinlock mfc_unres_lock. * * In this case data path is free of exclusive locks at all.
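* Readers therefore rely on RCU (the mr_tables list and the mfc rhashtable below are RCU-protected), while writers serialize on mrt_lock or, for the unresolved queue, on mfc_unres_lock.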
*/ static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv4.mr_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv4.mr_tables) return NULL; return ret; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ipmr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { int err; struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp4)); err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ipmr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = ipmr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { return 1; } static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), .action = ipmr_rule_action, .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int __net_init ipmr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ipmr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv4.mr_tables); mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if 
(IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0); if (err < 0) goto err2; net->ipv4.mr_rules_ops = ops; return 0; err2: rtnl_lock(); ipmr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } bool ipmr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; } EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv4.mrt; return NULL; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; } static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; return 0; } static int __net_init ipmr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv4.mrt = mrt; return 0; } static void __net_exit ipmr_rules_exit(struct net *net) { ASSERT_RTNL(); ipmr_free_table(net->ipv4.mrt); net->ipv4.mrt = NULL; } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ipmr_rules_seq_read(struct net *net) { return 0; } bool ipmr_rule_default(const struct fib_rule *rule) { return true; } EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc_cache_cmp_arg *cmparg = arg->key; const struct mfc_cache *c = ptr; return cmparg->mfc_mcastgrp != c->mfc_mcastgrp || cmparg->mfc_origin != c->mfc_origin; } static const struct rhashtable_params ipmr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ipmr_hash_cmp, .automatic_shrinking = true, }; static void ipmr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); #endif } static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { .mfc_mcastgrp = htonl(INADDR_ANY), .mfc_origin = htonl(INADDR_ANY), }; static struct mr_table_ops ipmr_mr_table_ops = { .rht_params = &ipmr_rht_params, .cmparg_any = &ipmr_mr_table_ops_cmparg_any, }; static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (id != RT_TABLE_DEFAULT && id >= 1000000000) return ERR_PTR(-EINVAL); mrt = ipmr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ipmr_mr_table_ops, ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) { timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | 
MRT_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ /* Initialize ipmr pimreg/tunnel in_device */ static bool ipmr_init_vif_indev(const struct net_device *dev) { struct in_device *in_dev; ASSERT_RTNL(); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return false; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; return true; } static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; struct ip_tunnel_parm p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); if (!tunnel_dev) goto out; p.iph.daddr = v->vifc_rmt_addr.s_addr; p.iph.saddr = v->vifc_lcl_addr.s_addr; p.iph.version = 4; p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl) goto out; err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCADDTUNNEL); if (err) goto out; new_dev = __dev_get_by_name(net, p.name); if (!new_dev) goto out; new_dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(new_dev)) goto out_unregister; if (dev_open(new_dev, NULL)) goto out_unregister; dev_hold(new_dev); err = dev_set_allmulti(new_dev, 1); if (err) { dev_close(new_dev); tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCDELTUNNEL); dev_put(new_dev); new_dev = ERR_PTR(err); } return new_dev; out_unregister: unregister_netdevice(new_dev); out: return ERR_PTR(-ENOBUFS); } #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_iif = skb->skb_iif ? 
                                           : LOOPBACK_IFINDEX,
                .flowi4_mark = skb->mark,
        };
        int err;

        err = ipmr_fib_lookup(net, &fl4, &mrt);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        DEV_STATS_ADD(dev, tx_bytes, skb->len);
        DEV_STATS_INC(dev, tx_packets);

        rcu_read_lock();

        /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
        ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
                          IGMPMSG_WHOLEPKT);

        rcu_read_unlock();
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

static int reg_vif_get_iflink(const struct net_device *dev)
{
        return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
        .ndo_get_iflink = reg_vif_get_iflink,
};

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->netdev_ops         = &reg_vif_netdev_ops;
        dev->needs_free_netdev  = true;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        char name[IFNAMSIZ];

        if (mrt->id == RT_TABLE_DEFAULT)
                sprintf(name, "pimreg");
        else
                sprintf(name, "pimreg%u", mrt->id);

        dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }

        if (!ipmr_init_vif_indev(dev))
                goto failure;
        if (dev_open(dev, NULL))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        unregister_netdevice(dev);
        return NULL;
}

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
                     unsigned int pimlen)
{
        struct net_device *reg_dev = NULL;
        struct iphdr *encap;
        int vif_num;

        encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
        /* Check that:
         * a. packet is really sent to a multicast group
         * b. packet is not a NULL-REGISTER
         * c.
packet is not truncated */ if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + pimlen > skb->len) return 1; /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */ vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[vif_num]); if (!reg_dev) return 1; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return NET_RX_SUCCESS; } #else static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) { return NULL; } #endif static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, vifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv4.ipmr_seq); } static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type, &mfc->_c, tb_id, &net->ipv4.ipmr_seq); } /** * vif_delete - Delete a VIF entry * @mrt: Table to delete from * @vifi: VIF identifier to delete * @notify: Set to 1, if the caller is a notifier_call * @head: if unregistering the VIF, place it on this queue */ static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; spin_lock(&mrt_lock); call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); RCU_INIT_POINTER(v->dev, NULL); if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); } if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static void ipmr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } static void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. 
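 *
 * Queued skbs that came from a netlink RTM_GETROUTE request (marked by
 * ipmr_get_route() with iph->version == 0) are answered with an
 * NLMSG_ERROR message carrying -ETIMEDOUT; ordinary queued data packets
 * are simply freed.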
*/ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; struct nlmsgerr *e; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); e = nlmsg_data(nlh); e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { kfree_skb(skb); } } ipmr_cache_free(c); } /* Timer process for the unresolved queue. */ static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); struct mr_mfc *c, *next; unsigned long expires; unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); return; } if (list_empty(&mrt->mfc_unres_queue)) goto out; now = jiffies; expires = 10*HZ; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); out: spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } cache->mfc_un.res.lastuse = jiffies; } static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { struct netdev_phys_item_id ppid = { }; int vifi = vifc->vifc_vifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? 
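 * (i.e. is a device already installed in the requested vifi slot?)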
*/ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { case VIFF_REGISTER: if (!ipmr_pimsm_enabled()) return -EINVAL; /* Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ipmr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); if (IS_ERR(dev)) return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); if (dev && !__in_dev_get_rtnl(dev)) { dev_put(dev); return -EADDRNOTAVAIL; } } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), (VIFF_TUNNEL | VIFF_REGISTER)); err = dev_get_port_parent_id(dev, &ppid, true); if (err == 0) { memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len); v->dev_parent_id.id_len = ppid.id_len; } else { v->dev_parent_id.id_len = 0; } v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); if (v->flags & VIFF_REGISTER) { /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); } if (vifi+1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } /* called with rcu_read_lock() */ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = origin }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, __be32 mcastgrp, int vifi) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = htonl(INADDR_ANY) }; if (mcastgrp == htonl(INADDR_ANY)) return mr_mfc_find_any_parent(mrt, vifi); return mr_mfc_find_any(mrt, vifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, __be32 origin, __be32 mcastgrp, int parent) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = origin, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c) { c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXVIFS; c->_c.free = ipmr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; } static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c) { 
skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; } return c; } /* A cache entry has gone into a resolved state from queued */ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc_cache *uc, struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); e = nlmsg_data(nlh); e->error = -EMSGSIZE; memset(&e->msg, 0, sizeof(e->msg)); } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip_mr_forward(net, mrt, skb->dev, skb, c, 0); rcu_read_unlock(); } } } /* Bounce a cache query up to mrouted and netlink. * * Called under rcu_read_lock(). */ static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { const int ihl = ip_hdrlen(pkt); struct sock *mroute_sk; struct igmphdr *igmp; struct igmpmsg *msg; struct sk_buff *skb; int ret; if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); if (!skb) return -ENOBUFS; if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; } else { /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num); msg->im_vif = vif_num; msg->im_vif_hi = vif_num >> 8; } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); } else { /* Copy the IP header */ skb_set_network_header(skb, skb->len); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); /* Flag to the kernel this is a route add */ ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } mroute_sk = rcu_dereference(mrt->mroute_sk); if (!mroute_sk) { kfree_skb(skb); return -EINVAL; } igmpmsg_netlink_event(mrt, skb); /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
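 *
 * The first packet for an unknown (S,G) allocates the unresolved entry
 * and sends an IGMPMSG_NOCACHE upcall to the daemon; at most three
 * further packets are queued on the entry while the daemon is expected
 * to answer with MRT_ADD_MFC. Entries that remain unresolved are reaped
 * by ipmr_expire_process() roughly ten seconds later.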
*/ /* Called under rcu_read_lock() */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); struct mfc_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; break; } } if (!found) { /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ipmr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) mod_timer(&mrt->ipmr_expire_timer, c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* MFC cache manipulation by user space mroute daemon */ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; struct mr_mfc *_uc; bool found; int ret; if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c = ipmr_cache_alloc(); if (!c) return -ENOMEM; c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. 
If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; struct mfc_cache *cache; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) continue; vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); cache = (struct mfc_cache *)c; call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); cache = (struct mfc_cache *)c; mroute_netlink_event(mrt, cache, RTM_DELROUTE); ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } } } /* called from ip_ra_control(), before an RCU grace period, * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct mr_table *mrt; rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); } } rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); int val, ret = 0, parent = 0; struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ rtnl_lock(); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_IGMP) { ret = -EOPNOTSUPP; goto out_unlock; } mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); if (!mrt) { ret = -ENOENT; goto out_unlock; } if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EACCES; goto out_unlock; } } switch (optname) { case MRT_INIT: if (optlen != sizeof(int)) { ret = -EINVAL; break; } if (rtnl_dereference(mrt->mroute_sk)) { ret = -EADDRINUSE; break; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } break; case MRT_DONE: if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { /* We need to unlock here because mrtsock_destruct takes * care of rtnl itself and we can't change that due to * the IP_ROUTER_ALERT setsockopt which runs without it. */ rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); goto out; } break; case MRT_ADD_VIF: case MRT_DEL_VIF: if (optlen != sizeof(vif)) { ret = -EINVAL; break; } if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } if (vif.vifc_vifi >= MAXVIFS) { ret = -ENFILE; break; } if (optname == MRT_ADD_VIF) { ret = vif_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } break; /* Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; fallthrough; case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { ret = -EINVAL; break; } if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } if (parent == 0) parent = mfc.mfcc_parent; if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); break; case MRT_FLUSH: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mroute_clean_tables(mrt, val); break; /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mrt->mroute_do_assert = val; break; case MRT_PIM: if (!ipmr_pimsm_enabled()) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(uval)) { ret = -EINVAL; break; } if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } if (sk == rtnl_dereference(mrt->mroute_sk)) { ret = -EBUSY; } else { mrt = ipmr_new_table(net, uval); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw_sk(sk)->ipmr_table = uval; } break; /* Spurious command, or MRT_VERSION which you cannot set. 
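 *
 * For orientation only: the userspace half of this interface (an
 * mrouted/pimd style daemon) boils down to roughly the sketch below.
 * This is not part of the file; includes, declarations and error
 * handling are omitted, and the interface names and addresses are
 * made-up examples.
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      struct vifctl vc0 = { .vifc_vifi = 0, .vifc_threshold = 1,
 *                            .vifc_flags = VIFF_USE_IFINDEX,
 *                            .vifc_lcl_ifindex = if_nametoindex("eth0") };
 *      struct vifctl vc1 = { .vifc_vifi = 1, .vifc_threshold = 1,
 *                            .vifc_flags = VIFF_USE_IFINDEX,
 *                            .vifc_lcl_ifindex = if_nametoindex("eth1") };
 *      struct mfcctl mc  = { .mfcc_parent = 0,
 *                            .mfcc_origin.s_addr   = inet_addr("192.0.2.1"),
 *                            .mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1") };
 *
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc0, sizeof(vc0));
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc1, sizeof(vc1));
 *      mc.mfcc_ttls[1] = 1;        (forward (S,G) arriving on vif 0 out of vif 1)
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *      ...
 *      setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);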
*/ default: ret = -ENOPROTOOPT; } out_unlock: rtnl_unlock(); out: return ret; } /* Execute if this ioctl is a special mroute ioctl */ int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { switch (cmd) { /* These userspace buffers will be consumed by ipmr_ioctl() */ case SIOCGETVIFCNT: { struct sioc_vif_req buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } case SIOCGETSGCNT: { struct sioc_sg_req buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } } /* return code > 0 means that the ioctl was not executed */ return 1; } /* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_IGMP) return -EOPNOTSUPP; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (optname) { case MRT_VERSION: val = 0x0305; break; case MRT_PIM: if (!ipmr_pimsm_enabled()) return -ENOPROTOOPT; val = mrt->mroute_do_pim; break; case MRT_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(unsigned int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* The IP multicast ioctl support routines. */ int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); struct sioc_vif_req *vr; struct sioc_sg_req *sr; struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: vr = (struct sioc_vif_req *)arg; if (vr->vifi >= mrt->maxvif) return -EINVAL; vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->vifi]; if (VIF_EXISTS(mrt, vr->vifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: sr = (struct sioc_sg_req *)arg; rcu_read_lock(); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { sr->pktcnt = c->_c.mfc_un.res.pkt; sr->bytecnt = c->_c.mfc_un.res.bytes; sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req { struct in_addr src; struct in_addr grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_vif_req { vifi_t vifi; /* Which iface */ compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req sr; struct compat_sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ipmr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) vif_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static struct notifier_block ip_mr_notifier = { .notifier_call = ipmr_device_event, }; /* Encapsulate a packet by attaching a valid IPIP header to it. * This avoids tunnel drivers and other mess and gives us the speed so * important for multicast video. */ static void ip_encap(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; const struct iphdr *old_iph = ip_hdr(skb); skb_push(skb, sizeof(struct iphdr)); skb->transport_header = skb->network_header; skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; iph->daddr = daddr; iph->saddr = saddr; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); ip_select_ident(net, skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); return dst_output(net, sk, skb); } #ifdef CONFIG_NET_SWITCHDEV static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; return netdev_phys_item_id_same(&out_vif->dev_parent_id, &in_vif->dev_parent_id); } #else static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { return false; } #endif /* Processing handlers for ipmr_forward, under rcu_read_lock() */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; 
struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) goto out_free; if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, RT_TOS(iph->tos), vif->link); if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); } else { rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, RT_TOS(iph->tos), vif->link); if (IS_ERR(rt)) goto out_free; } dev = rt->dst.dev; if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear * to blackhole. */ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); goto out_free; } WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. * What do we do with netfilter? -- RR */ if (vif->flags & VIFF_TUNNEL) { ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ DEV_STATS_INC(vif_dev, tx_packets); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); } IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; out_free: kfree_skb(skb); } /* Called with mrt_lock or rcu_read_lock() */ static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* "local" means that we should preserve one skb (for local delivery) */ /* Called uner rcu_read_lock() */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *c, int local) { int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; c->_c.mfc_un.res.bytes += skb->len; c->_c.mfc_un.res.lastuse = jiffies; if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. 
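 *
 * Concretely: look up the (*,*) proxy entry whose parent is this
 * entry's upstream VIF and accept the packet when that proxy lists the
 * arriving VIF as an output (its TTL threshold there is below 255);
 * only then do we skip the wrong-interface check and forward.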
*/ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... * * The best workaround until routing daemons will be * fixed is not to redistribute packet, if it was * send through wrong interface. It means, that * multicast applications WILL NOT work for * (S,G), which have default multicast route pointing * to wrong oif. In any case, it is not a good * idea to use multicasting applications on router. */ goto dont_forward; } c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, * so that we cannot check that packet arrived on an oif. * It is bad, but otherwise we would need to move pretty * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); if (mrt->mroute_do_wrvifwhole) ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRVIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* Forward the frame */ if (c->mfc_origin == htonl(INADDR_ANY) && c->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) && ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend); } else { ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); return; } } dont_forward: if (!local) kfree_skb(skb); } static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = RT_TOS(iph->tos), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? LOOPBACK_IFINDEX : skb->dev->ifindex), .flowi4_mark = skb->mark, }; struct mr_table *mrt; int err; err = ipmr_fib_lookup(net, &fl4, &mrt); if (err) return ERR_PTR(err); return mrt; } /* Multicast packets for forwarding arrive here * Called with rcu_read_lock(); */ int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; struct net_device *dev; /* skb->dev passed in is the loX master dev for vrfs. 
* As there are no vifs associated with loopback devices, * get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. */ if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) { kfree_skb(skb); return PTR_ERR(mrt); } if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) return 0; } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { /* IGMPv1 (and broken IGMPv2 implementations sort of * Cisco IOS <= 11.2(8)) do not put router alert * option to IGMP packets destined to routable * groups. It is very bad, because it means * that we can forward NO IGMP messages. */ struct sock *mroute_sk; mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } } } /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, vif); } /* No usable cache entry */ if (!cache) { int vif; if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); if (!skb2) return -ENOBUFS; skb = skb2; } vif = ipmr_find_vif(mrt, dev); if (vif >= 0) return ipmr_cache_unresolved(mrt, vif, skb, dev); kfree_skb(skb); return -ENODEV; } ip_mr_forward(net, mrt, dev, skb, cache, local); if (local) return ip_local_deliver(skb); return 0; dont_forward: if (local) return ip_local_deliver(skb); kfree_skb(skb); return 0; } #ifdef CONFIG_IP_PIMSM_V1 /* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = igmp_hdr(skb); mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } return 0; } #endif #ifdef CONFIG_IP_PIMSM_V2 static int pim_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct net *net = dev_net(skb->dev); struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) || (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } return 0; } #endif int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; int err; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; rcu_read_lock(); cache = ipmr_cache_find(mrt, saddr, daddr); if (!cache && skb->dev) { int vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, daddr, vif); } if (!cache) { struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; int vif = -1; dev = skb->dev; if (dev) vif = ipmr_find_vif(mrt, dev); if (vif < 0) { 
rcu_read_unlock(); return -ENODEV; } skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_push(skb2, sizeof(struct iphdr)); skb_reset_network_header(skb2); iph = ip_hdr(skb2); iph->ihl = sizeof(struct iphdr) >> 2; iph->saddr = saddr; iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IPMR; rtm->rtm_dst_len = 32; rtm->rtm_src_len = 32; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c, cmd, flags); } static size_t mroute_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_SRC */ + nla_total_size(4) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } static size_t igmpmsg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IPMRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct igmpmsg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct igmpmsg); msg = (struct igmpmsg *)skb_network_header(pkt); skb = 
nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IPMR; if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) || nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, msg->im_dst.s_addr) || nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); } static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_TABLE: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request"); return -EINVAL; } } return 0; } static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX + 1]; struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; __be32 src, grp; u32 tableid; int err; err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; mrt = ipmr_get_table(net, tableid ? 
tableid : RT_TABLE_DEFAULT); if (!mrt) { err = -ENOENT; goto errout_free; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ipmr_cache_find(mrt, src, grp); rcu_read_unlock(); if (!cache) { err = -ENOENT; goto errout_free; } skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout_free; } err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) goto errout_free; err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; errout_free: kfree_skb(skb); goto errout; } static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = {}; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) return skb->len; NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, _ipmr_fill_mroute, &mfc_unres_lock, &filter); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { [RTA_SRC] = { .type = NLA_U32 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, }; static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol) { switch (rtm_protocol) { case RTPROT_STATIC: case RTPROT_MROUTED: return true; } return false; } static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc) { struct rtnexthop *rtnh = nla_data(nla); int remaining = nla_len(nla), vifi = 0; while (rtnh_ok(rtnh, remaining)) { mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops; if (++vifi == MAXVIFS) break; rtnh = rtnh_next(rtnh, &remaining); } return remaining > 0 ? -EINVAL : vifi; } /* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct mfcctl *mfcc, int *mrtsock, struct mr_table **mrtret, struct netlink_ext_ack *extack) { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; int ret, rem; ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); ret = -EINVAL; if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 || rtm->rtm_type != RTN_MULTICAST || rtm->rtm_scope != RT_SCOPE_UNIVERSE || !ipmr_rtm_validate_proto(rtm->rtm_protocol)) goto out; memset(mfcc, 0, sizeof(*mfcc)); mfcc->mfcc_parent = -1; ret = 0; nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) { switch (nla_type(attr)) { case RTA_SRC: mfcc->mfcc_origin.s_addr = nla_get_be32(attr); break; case RTA_DST: mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: dev = __dev_get_by_index(net, nla_get_u32(attr)); if (!dev) { ret = -ENODEV; goto out; } break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { ret = -EINVAL; goto out; } break; case RTA_PREFSRC: ret = 1; break; case RTA_TABLE: tblid = nla_get_u32(attr); break; } } mrt = ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; goto out; } *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 
1 : 0; if (dev) mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); out: return ret; } /* takes care of both newroute and delroute */ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int ret, mrtsock, parent; struct mr_table *tbl; struct mfcctl mfcc; mrtsock = 0; tbl = NULL; ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; parent = ret ? mfcc.mfcc_parent : -1; if (nlh->nlmsg_type == RTM_NEWROUTE) return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); else return ipmr_mfc_delete(tbl, &mfcc, parent); } static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) { u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len); if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, mrt->mroute_do_wrvifwhole)) return false; return true; } static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) { struct net_device *vif_dev; struct nlattr *vif_nest; struct vif_device *vif; vif = &mrt->vif_table[vifid]; vif_dev = rtnl_dereference(vif->dev); /* if the VIF doesn't exist just continue */ if (!vif_dev) return true; vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, IPMRA_VIFA_PAD) || nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { nla_nest_cancel(skb, vif_nest); return false; } nla_nest_end(skb, vif_nest); return true; } static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); return -EINVAL; } return 0; } static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh = NULL; unsigned int t = 0, s_t; unsigned int e = 0, s_e; struct mr_table *mrt; if (cb->strict_check) { int err = ipmr_valid_dumplink(cb->nlh, cb->extack); if (err < 0) return err; } s_t = cb->args[0]; s_e = cb->args[1]; ipmr_for_each_table(mrt, net) { struct nlattr *vifs, *af; struct ifinfomsg *hdr; u32 i; if (t < s_t) goto skip_table; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWLINK, sizeof(*hdr), NLM_F_MULTI); if (!nlh) break; hdr = nlmsg_data(nlh); memset(hdr, 0, sizeof(*hdr)); hdr->ifi_family = 
RTNL_FAMILY_IPMR; af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) { nlmsg_cancel(skb, nlh); goto out; } if (!ipmr_fill_table(mrt, skb)) { nlmsg_cancel(skb, nlh); goto out; } vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS); if (!vifs) { nla_nest_end(skb, af); nlmsg_end(skb, nlh); goto out; } for (i = 0; i < mrt->maxvif; i++) { if (e < s_e) goto skip_entry; if (!ipmr_fill_vif(mrt, i, skb)) { nla_nest_end(skb, vifs); nla_nest_end(skb, af); nlmsg_end(skb, nlh); goto out; } skip_entry: e++; } s_e = 0; e = 0; nla_nest_end(skb, vifs); nla_nest_end(skb, af); nlmsg_end(skb, nlh); skip_table: t++; } out: cb->args[1] = e; cb->args[0] = t; return skb->len; } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); } return 0; } static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, .next = mr_vif_seq_next, .stop = ipmr_vif_seq_stop, .show = ipmr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group Origin Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->_c.mfc_un.res.pkt, mfc->_c.mfc_un.res.bytes, mfc->_c.mfc_un.res.wrong_if); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, }; #endif static unsigned int ipmr_seq_read(struct net *net) { ASSERT_RTNL(); return 
net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, ipmr_mr_table_iter, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { .family = RTNL_FAMILY_IPMR, .fib_seq_read = ipmr_seq_read, .fib_dump = ipmr_dump, .owner = THIS_MODULE, }; static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.ipmr_seq = 0; ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.ipmr_notifier_ops = ops; return 0; } static void __net_exit ipmr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); net->ipv4.ipmr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; err = ipmr_notifier_init(net); if (err) goto ipmr_notifier_fail; err = ipmr_rules_init(net); if (err < 0) goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ipmr_rules_exit(net); rtnl_unlock(); #endif ipmr_rules_fail: ipmr_notifier_exit(net); ipmr_notifier_fail: return err; } static void __net_exit ipmr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif ipmr_notifier_exit(net); } static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ipmr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, .exit_batch = ipmr_net_exit_batch, }; int __init ip_mr_init(void) { int err; mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); err = register_pernet_subsys(&ipmr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IP_PIMSM_V2 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, ipmr_rtm_getroute, ipmr_rtm_dumproute, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, NULL, ipmr_rtm_dumplink, 0); return 0; #ifdef CONFIG_IP_PIMSM_V2 add_proto_fail: unregister_netdevice_notifier(&ip_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ipmr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } |
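The strict validation in ipmr_rtm_valid_getroute_req() above spells out what a well-formed multicast RTM_GETROUTE request looks like: rtm_family set to RTNL_FAMILY_IPMR, prefix lengths of 32 whenever RTA_SRC or RTA_DST is supplied, every other header field zero, and no attributes beyond RTA_SRC, RTA_DST and RTA_TABLE. The userspace sketch below is not part of the kernel sources; it is a minimal illustration of such a request over raw rtnetlink, with the addresses and the add_attr() helper invented for the example and error handling omitted.

/* Minimal userspace sketch (not from the kernel sources above): query a
 * single (S,G) multicast route via RTM_GETROUTE, following the rules that
 * ipmr_rtm_valid_getroute_req() enforces under strict checking.
 */
#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static struct {
	struct nlmsghdr nlh;
	struct rtmsg rtm;
	char attrs[64];
} req;

/* hypothetical helper for the example: append one rtattr to the request */
static void add_attr(unsigned short type, const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)&req +
					       NLMSG_ALIGN(req.nlh.nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) +
			    RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct in_addr src, grp;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	inet_pton(AF_INET, "192.0.2.1", &src);	/* example source */
	inet_pton(AF_INET, "239.1.1.1", &grp);	/* example group */

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = RTNL_FAMILY_IPMR;
	req.rtm.rtm_src_len = 32;	/* required when RTA_SRC is present */
	req.rtm.rtm_dst_len = 32;	/* required when RTA_DST is present */

	add_attr(RTA_SRC, &src, sizeof(src));
	add_attr(RTA_DST, &grp, sizeof(grp));

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	/* the RTM_NEWROUTE reply (or a netlink error) would be read back here */
	close(fd);
	return 0;
}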
// SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * associating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkg - Per (cgroup, request queue) data. * @pd: blkg_policy_data structure. */ struct ioprio_blkg { struct blkg_policy_data pd; }; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) { return pd ?
container_of(pd, struct ioprio_blkg, pd) : NULL; } static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) { struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); if (!pd) return NULL; return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkg_policy_data * ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); if (!ioprio_blkg) return NULL; return &ioprio_blkg->pd; } static void ioprio_free_pd(struct blkg_policy_data *pd) { struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); kfree(ioprio_blkg); } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } #define IOPRIO_ATTRS \ { \ .name = "prio.class", \ .seq_show = ioprio_show_prio_policy, \ .write = ioprio_set_prio_policy, \ }, \ { } /* sentinel */ /* cgroup v2 attributes */ static struct cftype ioprio_files[] = { IOPRIO_ATTRS }; /* cgroup v1 attributes */ static struct cftype ioprio_legacy_files[] = { IOPRIO_ATTRS }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_legacy_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, .pd_alloc_fn = ioprio_alloc_pd, .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. 
*/ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } void blk_ioprio_exit(struct gendisk *disk) { blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit); |
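The comment at the end of blkcg_set_ioprio() leans on two facts: the numeric prio_policy values for restrict-to-be and idle coincide with IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE, and a numerically larger priority value encodes a lower class. The short userspace sketch below is not taken from the sources above; it restates the <linux/ioprio.h> encoding as local macros (an assumption made explicit so the example stands alone) and reproduces the max_t() comparison to show an RT bio being demoted to BE.

/* Minimal userspace sketch (not from the sources above): why the max_t() in
 * blkcg_set_ioprio() implements "restrict to a lower class". The macros
 * mirror the kernel's <linux/ioprio.h> encoding, restated here as an
 * assumption so the example is self-contained.
 */
#include <stdio.h>

#define IOPRIO_CLASS_SHIFT		13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(ioprio)	((ioprio) >> IOPRIO_CLASS_SHIFT)

enum { CLS_NONE, CLS_RT, CLS_BE, CLS_IDLE };	/* 0..3, as in IOPRIO_CLASS_* */

int main(void)
{
	/* bio already at RT level 4 vs. a cgroup policy of restrict-to-be */
	unsigned short bio_prio = IOPRIO_PRIO_VALUE(CLS_RT, 4);
	unsigned short cgrp_prio = IOPRIO_PRIO_VALUE(CLS_BE, 0);
	unsigned short prio = bio_prio > cgrp_prio ? bio_prio : cgrp_prio;

	/* A larger numeric value means a lower priority class, so the bio is
	 * demoted from RT to BE; a bio that is already IDLE stays IDLE.
	 */
	printf("effective class: %u\n", IOPRIO_PRIO_CLASS(prio));	/* prints 2 (BE) */
	return 0;
}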
// SPDX-License-Identifier: GPL-2.0-or-later /* * "LAPB via ethernet" driver release 001 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * This is a "pseudo" network driver to allow LAPB over Ethernet. * * This driver can use any ethernet destination address, and can be * limited to accept frames from one dedicated ethernet card only. * * History * LAPBETH 001 Jonathan Naylor Cloned from bpqether.c * 2000-10-29 Henner Eisen lapb_data_indication() return status. * 2000-11-14 Henner Eisen dev_hold/put, NETDEV_GOING_DOWN support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/lapb.h> #include <linux/init.h> #include <net/x25device.h> static const u8 bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* If this number is made larger, check that the temporary string buffer * in lapbeth_new_device is large enough to store the probe device name.
*/ #define MAXLAPBDEV 100 struct lapbethdev { struct list_head node; struct net_device *ethdev; /* link to ethernet device */ struct net_device *axdev; /* lapbeth device (lapb#) */ bool up; spinlock_t up_lock; /* Protects "up" */ struct sk_buff_head rx_queue; struct napi_struct napi; }; static LIST_HEAD(lapbeth_devices); static void lapbeth_connected(struct net_device *dev, int reason); static void lapbeth_disconnected(struct net_device *dev, int reason); /* ------------------------------------------------------------------------ */ /* Get the LAPB device for the ethernet device */ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } return NULL; } static __inline__ int dev_is_ethdev(struct net_device *dev) { return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); } /* ------------------------------------------------------------------------ */ static int lapbeth_napi_poll(struct napi_struct *napi, int budget) { struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev, napi); struct sk_buff *skb; int processed = 0; for (; processed < budget; ++processed) { skb = skb_dequeue(&lapbeth->rx_queue); if (!skb) break; netif_receive_skb_core(skb); } if (processed < budget) napi_complete(napi); return processed; } /* Receive a LAPB frame via an ethernet interface. */ static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; if (dev_net(dev) != &init_net) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, 2)) goto drop; rcu_read_lock(); lapbeth = lapbeth_get_x25_dev(dev); if (!lapbeth) goto drop_unlock_rcu; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop_unlock; len = skb->data[0] + skb->data[1] * 256; dev->stats.rx_packets++; dev->stats.rx_bytes += len; skb_pull(skb, 2); /* Remove the length bytes */ skb_trim(skb, len); /* Set the length of the data */ err = lapb_data_received(lapbeth->axdev, skb); if (err != LAPB_OK) { printk(KERN_DEBUG "lapbether: lapb_data_received err - %d\n", err); goto drop_unlock; } out: spin_unlock_bh(&lapbeth->up_lock); rcu_read_unlock(); return 0; drop_unlock: kfree_skb(skb); goto out; drop_unlock_rcu: rcu_read_unlock(); drop: kfree_skb(skb); return 0; } static int lapbeth_data_indication(struct net_device *dev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; if (skb_cow(skb, 1)) { kfree_skb(skb); return NET_RX_DROP; } skb_push(skb, 1); ptr = skb->data; *ptr = X25_IFACE_DATA; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); return NET_RX_SUCCESS; } /* Send a LAPB frame via an ethernet interface */ static netdev_tx_t lapbeth_xmit(struct sk_buff *skb, struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop; /* There should be a pseudo header of 1 byte added by upper layers. * Check to make sure it is there before reading it. 
*/ if (skb->len < 1) goto drop; switch (skb->data[0]) { case X25_IFACE_DATA: break; case X25_IFACE_CONNECT: err = lapb_connect_request(dev); if (err == LAPB_CONNECTED) lapbeth_connected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_connect_request error: %d\n", err); goto drop; case X25_IFACE_DISCONNECT: err = lapb_disconnect_request(dev); if (err == LAPB_NOTCONNECTED) lapbeth_disconnected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_disconnect_request err: %d\n", err); fallthrough; default: goto drop; } skb_pull(skb, 1); err = lapb_data_request(dev, skb); if (err != LAPB_OK) { pr_err("lapb_data_request error - %d\n", err); goto drop; } out: spin_unlock_bh(&lapbeth->up_lock); return NETDEV_TX_OK; drop: kfree_skb(skb); goto out; } static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(ndev); unsigned char *ptr; struct net_device *dev; int size = skb->len; ptr = skb_push(skb, 2); *ptr++ = size % 256; *ptr++ = size / 256; ndev->stats.tx_packets++; ndev->stats.tx_bytes += size; skb->dev = dev = lapbeth->ethdev; skb->protocol = htons(ETH_P_DEC); skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); } static void lapbeth_connected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } static void lapbeth_disconnected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_DISCONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } /* Set AX.25 callsign */ static int lapbeth_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; dev_addr_set(dev, sa->sa_data); return 0; } static const struct lapb_register_struct lapbeth_callbacks = { .connect_confirmation = lapbeth_connected, .connect_indication = lapbeth_connected, .disconnect_confirmation = lapbeth_disconnected, .disconnect_indication = lapbeth_disconnected, .data_indication = lapbeth_data_indication, .data_transmit = lapbeth_data_transmit, }; /* open/close a device */ static int lapbeth_open(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; napi_enable(&lapbeth->napi); err = lapb_register(dev, &lapbeth_callbacks); if (err != LAPB_OK) { napi_disable(&lapbeth->napi); pr_err("lapb_register error: %d\n", err); return -ENODEV; } spin_lock_bh(&lapbeth->up_lock); lapbeth->up = true; spin_unlock_bh(&lapbeth->up_lock); return 0; } static int lapbeth_close(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); lapbeth->up = false; spin_unlock_bh(&lapbeth->up_lock); err = lapb_unregister(dev); if (err != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); napi_disable(&lapbeth->napi); return 0; } /* ------------------------------------------------------------------------ */ static const struct net_device_ops lapbeth_netdev_ops = { .ndo_open = lapbeth_open, .ndo_stop = lapbeth_close, .ndo_start_xmit = lapbeth_xmit, .ndo_set_mac_address = 
lapbeth_set_mac_address, }; static void lapbeth_setup(struct net_device *dev) { dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; dev->hard_header_len = 0; dev->mtu = 1000; dev->addr_len = 0; } /* Setup a new device. */ static int lapbeth_new_device(struct net_device *dev) { struct net_device *ndev; struct lapbethdev *lapbeth; int rc = -ENOMEM; ASSERT_RTNL(); if (dev->type != ARPHRD_ETHER) return -EINVAL; ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN, lapbeth_setup); if (!ndev) goto out; /* When transmitting data: * first this driver removes a pseudo header of 1 byte, * then the lapb module prepends an LAPB header of at most 3 bytes, * then this driver prepends a length field of 2 bytes, * then the underlying Ethernet device prepends its own header. */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; dev_hold(dev); lapbeth->ethdev = dev; lapbeth->up = false; spin_lock_init(&lapbeth->up_lock); skb_queue_head_init(&lapbeth->rx_queue); netif_napi_add_weight(ndev, &lapbeth->napi, lapbeth_napi_poll, 16); rc = -EIO; if (register_netdevice(ndev)) goto fail; list_add_rcu(&lapbeth->node, &lapbeth_devices); rc = 0; out: return rc; fail: dev_put(dev); free_netdev(ndev); goto out; } /* Free a lapb network device. */ static void lapbeth_free_device(struct lapbethdev *lapbeth) { dev_put(lapbeth->ethdev); list_del_rcu(&lapbeth->node); unregister_netdevice(lapbeth->axdev); } /* Handle device status changes. * * Called from notifier with RTNL held. */ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (!dev_is_ethdev(dev) && !lapbeth_get_x25_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: /* New ethernet device -> new LAPB interface */ if (!lapbeth_get_x25_dev(dev)) lapbeth_new_device(dev); break; case NETDEV_GOING_DOWN: /* ethernet device closes -> close LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) dev_close(lapbeth->axdev); break; case NETDEV_UNREGISTER: /* ethernet device disappears -> remove LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) lapbeth_free_device(lapbeth); break; } return NOTIFY_DONE; } /* ------------------------------------------------------------------------ */ static struct packet_type lapbeth_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DEC), .func = lapbeth_rcv, }; static struct notifier_block lapbeth_dev_notifier = { .notifier_call = lapbeth_device_event, }; static const char banner[] __initconst = KERN_INFO "LAPB Ethernet driver version 0.02\n"; static int __init lapbeth_init_driver(void) { dev_add_pack(&lapbeth_packet_type); register_netdevice_notifier(&lapbeth_dev_notifier); printk(banner); return 0; } module_init(lapbeth_init_driver); static void __exit lapbeth_cleanup_driver(void) { struct lapbethdev *lapbeth; struct list_head *entry, *tmp; dev_remove_pack(&lapbeth_packet_type); unregister_netdevice_notifier(&lapbeth_dev_notifier); rtnl_lock(); list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); } module_exit(lapbeth_cleanup_driver); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); 
MODULE_DESCRIPTION("The unofficial LAPB over Ethernet driver"); MODULE_LICENSE("GPL");
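lapbeth_data_transmit() frames each LAPB packet with a two-byte length field, low byte first, and lapbeth_rcv() reverses that by computing skb->data[0] + skb->data[1] * 256. A minimal standalone sketch of that framing follows; the helper names are invented for the example and are not part of the driver.

/* Minimal sketch (not part of the driver): the 2-byte length pseudo header
 * that lapbeth_data_transmit() prepends and lapbeth_rcv() strips again.
 * The length is stored low byte first, so a 300-byte LAPB frame is framed
 * as { 0x2c, 0x01 } followed by the payload.
 */
#include <assert.h>
#include <stddef.h>

static void lapbeth_put_len(unsigned char *hdr, size_t len)
{
	hdr[0] = len % 256;	/* low byte, as in lapbeth_data_transmit() */
	hdr[1] = len / 256;	/* high byte */
}

static size_t lapbeth_get_len(const unsigned char *hdr)
{
	return hdr[0] + hdr[1] * 256;	/* as in lapbeth_rcv() */
}

int main(void)
{
	unsigned char hdr[2];

	lapbeth_put_len(hdr, 300);
	assert(hdr[0] == 0x2c && hdr[1] == 0x01);
	assert(lapbeth_get_len(hdr) == 300);
	return 0;
}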
// SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include "netdevsim.h" #define NSIM_DEV_HWSTATS_TRAFFIC_MS 100 static struct list_head * nsim_dev_hwstats_get_list_head(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return &hwstats->l3_list; } WARN_ON_ONCE(1); return NULL; } static void nsim_dev_hwstats_traffic_bump(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->enabled) { hwsdev->stats.rx_packets += 1; hwsdev->stats.tx_packets += 2; hwsdev->stats.rx_bytes += 100; hwsdev->stats.tx_bytes += 300; } } } static void nsim_dev_hwstats_traffic_work(struct work_struct *work) { struct nsim_dev_hwstats *hwstats; hwstats = container_of(work, struct nsim_dev_hwstats, traffic_dw.work); mutex_lock(&hwstats->hwsdev_list_lock); nsim_dev_hwstats_traffic_bump(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); } static struct nsim_dev_hwstats_netdev * nsim_dev_hwslist_find_hwsdev(struct list_head *hwsdev_list, int ifindex) { struct nsim_dev_hwstats_netdev *hwsdev; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->netdev->ifindex == ifindex) return hwsdev; } return NULL; } static int nsim_dev_hwsdev_enable(struct nsim_dev_hwstats_netdev *hwsdev, struct netlink_ext_ack *extack) { if (hwsdev->fail_enable) {
hwsdev->fail_enable = false; NL_SET_ERR_MSG_MOD(extack, "Stats enablement set to fail"); return -ECANCELED; } hwsdev->enabled = true; return 0; } static void nsim_dev_hwsdev_disable(struct nsim_dev_hwstats_netdev *hwsdev) { hwsdev->enabled = false; memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); } static int nsim_dev_hwsdev_report_delta(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { netdev_offload_xstats_report_delta(info->report_delta, &hwsdev->stats); memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); return 0; } static void nsim_dev_hwsdev_report_used(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { if (hwsdev->enabled) netdev_offload_xstats_report_used(info->report_used); } static int nsim_dev_hwstats_event_off_xstats(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_offload_xstats_info *info; struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; int err = 0; info = ptr; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, info->type); if (!hwsdev_list) return 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) goto out; switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: err = nsim_dev_hwsdev_enable(hwsdev, info->info.extack); break; case NETDEV_OFFLOAD_XSTATS_DISABLE: nsim_dev_hwsdev_disable(hwsdev); break; case NETDEV_OFFLOAD_XSTATS_REPORT_USED: nsim_dev_hwsdev_report_used(hwsdev, info); break; case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: err = nsim_dev_hwsdev_report_delta(hwsdev, info); break; } out: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_fini(struct nsim_dev_hwstats_netdev *hwsdev) { dev_put(hwsdev->netdev); kfree(hwsdev); } static void __nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) return; list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } static void nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev) { mutex_lock(&hwstats->hwsdev_list_lock); __nsim_dev_hwstats_event_unregister(hwstats, dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); } static int nsim_dev_hwstats_event(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: return nsim_dev_hwstats_event_off_xstats(hwstats, dev, event, ptr); case NETDEV_UNREGISTER: nsim_dev_hwstats_event_unregister(hwstats, dev); break; } return 0; } static int nsim_dev_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nsim_dev_hwstats *hwstats; int err = 0; hwstats = container_of(nb, struct nsim_dev_hwstats, netdevice_nb); err = nsim_dev_hwstats_event(hwstats, dev, event, ptr); if (err) return notifier_from_errno(err); return NOTIFY_OK; } static int nsim_dev_hwstats_enable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct 
list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; struct nsim_dev *nsim_dev; struct net_device *netdev; bool notify = false; struct net *net; int err = 0; nsim_dev = container_of(hwstats, struct nsim_dev, hwstats); net = nsim_dev_net(nsim_dev); rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) goto out_unlock_list; netdev = dev_get_by_index(net, ifindex); if (!netdev) { err = -ENODEV; goto out_unlock_list; } hwsdev = kzalloc(sizeof(*hwsdev), GFP_KERNEL); if (!hwsdev) { err = -ENOMEM; goto out_put_netdev; } hwsdev->netdev = netdev; list_add_tail(&hwsdev->list, hwsdev_list); mutex_unlock(&hwstats->hwsdev_list_lock); if (netdev_offload_xstats_enabled(netdev, type)) { nsim_dev_hwsdev_enable(hwsdev, NULL); notify = true; } if (notify) rtnl_offload_xstats_notify(netdev); rtnl_unlock(); return err; out_put_netdev: dev_put(netdev); out_unlock_list: mutex_unlock(&hwstats->hwsdev_list_lock); rtnl_unlock(); return err; } static int nsim_dev_hwstats_disable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) list_del(&hwsdev->list); mutex_unlock(&hwstats->hwsdev_list_lock); if (!hwsdev) { err = -ENOENT; goto unlock_out; } if (netdev_offload_xstats_enabled(hwsdev->netdev, type)) { netdev_offload_xstats_push_delta(hwsdev->netdev, type, &hwsdev->stats); rtnl_offload_xstats_notify(hwsdev->netdev); } nsim_dev_hwsdev_fini(hwsdev); unlock_out: rtnl_unlock(); return err; } static int nsim_dev_hwstats_fail_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (!hwsdev) { err = -ENOENT; goto err_hwsdev_list_unlock; } hwsdev->fail_enable = true; err_hwsdev_list_unlock: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } enum nsim_dev_hwstats_do { NSIM_DEV_HWSTATS_DO_DISABLE, NSIM_DEV_HWSTATS_DO_ENABLE, NSIM_DEV_HWSTATS_DO_FAIL, }; struct nsim_dev_hwstats_fops { const struct file_operations fops; enum nsim_dev_hwstats_do action; enum netdev_offload_xstats_type type; }; static ssize_t nsim_dev_hwstats_do_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev_hwstats *hwstats = file->private_data; struct nsim_dev_hwstats_fops *hwsfops; struct list_head *hwsdev_list; int ifindex; int err; hwsfops = container_of(debugfs_real_fops(file), struct nsim_dev_hwstats_fops, fops); err = kstrtoint_from_user(data, count, 0, &ifindex); if (err) return err; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, hwsfops->type); if (WARN_ON(!hwsdev_list)) return -EINVAL; switch (hwsfops->action) { case NSIM_DEV_HWSTATS_DO_DISABLE: err = nsim_dev_hwstats_disable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_ENABLE: err = nsim_dev_hwstats_enable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_FAIL: err = nsim_dev_hwstats_fail_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; } if (err) return err; return count; } #define NSIM_DEV_HWSTATS_FOPS(ACTION, TYPE) \ { \ .fops = { \ .open = simple_open, \ .write = nsim_dev_hwstats_do_write, \ 
.llseek = generic_file_llseek, \ .owner = THIS_MODULE, \ }, \ .action = ACTION, \ .type = TYPE, \ } static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_disable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_DISABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_enable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_ENABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_fail_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_FAIL, NETDEV_OFFLOAD_XSTATS_TYPE_L3); #undef NSIM_DEV_HWSTATS_FOPS int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); int err; mutex_init(&hwstats->hwsdev_list_lock); INIT_LIST_HEAD(&hwstats->l3_list); hwstats->netdevice_nb.notifier_call = nsim_dev_netdevice_event; err = register_netdevice_notifier_net(net, &hwstats->netdevice_nb); if (err) goto err_mutex_destroy; hwstats->ddir = debugfs_create_dir("hwstats", nsim_dev->ddir); if (IS_ERR(hwstats->ddir)) { err = PTR_ERR(hwstats->ddir); goto err_unregister_notifier; } hwstats->l3_ddir = debugfs_create_dir("l3", hwstats->ddir); if (IS_ERR(hwstats->l3_ddir)) { err = PTR_ERR(hwstats->l3_ddir); goto err_remove_hwstats_recursive; } debugfs_create_file("enable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_enable_fops.fops); debugfs_create_file("disable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_disable_fops.fops); debugfs_create_file("fail_next_enable", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_fail_fops.fops); INIT_DELAYED_WORK(&hwstats->traffic_dw, &nsim_dev_hwstats_traffic_work); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); return 0; err_remove_hwstats_recursive: debugfs_remove_recursive(hwstats->ddir); err_unregister_notifier: unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); err_mutex_destroy: mutex_destroy(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_list_wipe(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev, *tmp; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; mutex_lock(&hwstats->hwsdev_list_lock); list_for_each_entry_safe(hwsdev, tmp, hwsdev_list, list) { list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } mutex_unlock(&hwstats->hwsdev_list_lock); } void nsim_dev_hwstats_exit(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); cancel_delayed_work_sync(&hwstats->traffic_dw); debugfs_remove_recursive(hwstats->ddir); unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); nsim_dev_hwsdev_list_wipe(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_destroy(&hwstats->hwsdev_list_lock); } |
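The debugfs files registered in nsim_dev_hwstats_init() take a decimal ifindex, parsed by kstrtoint_from_user(). As a hedged illustration, the userspace sketch below enables emulated L3 stats for one device; the debugfs path and the interface name are assumptions for the example rather than guaranteed locations.

/* Minimal userspace sketch (not from the sources above): turn on emulated
 * L3 hardware stats collection for one netdev by writing its ifindex to the
 * debugfs knob created in nsim_dev_hwstats_init(). The path below assumes
 * the usual netdevsim debugfs layout.
 */
#include <fcntl.h>
#include <net/if.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *knob =
		"/sys/kernel/debug/netdevsim/netdevsim0/hwstats/l3/enable_ifindex";
	char buf[16];
	unsigned int ifindex = if_nametoindex("eth0");	/* example device */
	int fd, len;

	if (!ifindex)
		return 1;

	len = snprintf(buf, sizeof(buf), "%u", ifindex);
	fd = open(knob, O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, buf, len) != len)	/* parsed by kstrtoint_from_user() */
		perror("write");
	close(fd);
	return 0;
}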
3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_policy.c
 *
 * Changes:
 *	Mitsuru KANDA @USAGI
 *	Kazunori MIYAZAWA @USAGI
 *	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *		IPv6 support
 *	Kazunori MIYAZAWA @USAGI
 *	YOSHIFUJI Hideaki
 *		Split up af-specific portion
 *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <linux/rhashtable.h>
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
#ifdef CONFIG_XFRM_ESPINTCP
#include <net/espintcp.h>
#endif

#include "xfrm_hash.h"

#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN	100

struct xfrm_flo {
	struct dst_entry *dst_orig;
	u8 flags;
};

/* prefixes smaller than this are stored in lists, not trees. */
#define INEXACT_PREFIXLEN_IPV4	16
#define INEXACT_PREFIXLEN_IPV6	48

struct xfrm_pol_inexact_node {
	struct rb_node node;
	union {
		xfrm_address_t addr;
		struct rcu_head rcu;
	};
	u8 prefixlen;

	struct rb_root root;

	/* the policies matching this node, can be empty list */
	struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with
 * the lowest priority. If two results have same prio, youngest one wins.
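 *
 * As a worked example (the prefixes here are hypothetical): a lookup for
 * saddr 10.0.1.2 -> daddr 10.0.2.3 collects, when such nodes exist,
 *   - the bin's own hhead (any:any),
 *   - the hhead of a 10.0.2.0/24 node found in root_d (any:daddr),
 *   - the hhead of a 10.0.1.0/24 node in that node's saddr subtree
 *     (saddr:daddr),
 *   - the hhead of a 10.0.1.0/24 node found in root_s (saddr:any),
 * and xfrm_policy_eval_candidates() then picks the best match among these
 * four lists.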
*/ struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; u16 family; u8 dir, type; }; struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; /* list containing '*:*' policies */ struct hlist_head hhead; seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; /* tree sorted by saddr/prefix */ struct rb_root root_s; /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, }; struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo 
*xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). */ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. 
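 *
 * A rough usage sketch (simplified; the real callers in af_key and
 * xfrm_user fill in much more state):
 *
 *	pol = xfrm_policy_alloc(net, GFP_KERNEL);
 *	if (!pol)
 *		return -ENOMEM;
 *	... copy selector, mark, lifetimes and templates into pol ...
 *	err = xfrm_policy_insert(dir, pol, excl);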
*/ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc(sizeof(struct xfrm_policy), gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ static void xfrm_policy_kill(struct xfrm_policy *policy) { write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, 
bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } 
EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) { struct xfrm_pol_inexact_bin *bin, *prev; struct xfrm_pol_inexact_key k = { .family = pol->family, .type = pol->type, .dir = dir, .if_id = pol->if_id, }; struct net *net = xp_net(pol); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); write_pnet(&k.net, net); bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); if (bin) return bin; bin = kzalloc(sizeof(*bin), GFP_ATOMIC); if (!bin) return NULL; bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, xfrm_pol_inexact_params); if (!prev) { list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); return bin; } kfree(bin); return IS_ERR(prev) ? NULL : prev; } static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, int family, u8 prefixlen) { if (xfrm_addr_any(addr, family)) return true; if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) return true; if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) return true; return false; } static bool xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) { const xfrm_address_t *addr; bool saddr_any, daddr_any; u8 prefixlen; addr = &policy->selector.saddr; prefixlen = policy->selector.prefixlen_s; saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); addr = &policy->selector.daddr; prefixlen = policy->selector.prefixlen_d; daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); return saddr_any && daddr_any; } static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, const xfrm_address_t *addr, u8 prefixlen) { node->addr = *addr; node->prefixlen = prefixlen; } static struct xfrm_pol_inexact_node * xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); return node; } static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: if (prefixlen == 0) return 0; mask = ~0U << (32 - prefixlen); ma = ntohl(a->a4) & mask; mb = ntohl(b->a4) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; if (pdw) { delta = memcmp(a->a6, b->a6, pdw << 2); if (delta) return delta; } if (pbi) { mask = ~0U << (32 - pbi); ma = ntohl(a->a6[pdw]) & mask; mb = ntohl(b->a6[pdw]) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; } break; default: break; } return delta; } 
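/*
 * Illustrative sketch: a hypothetical helper, xfrm_addr_delta_selftest(),
 * showing how xfrm_policy_addr_delta() above orders two IPv4 addresses
 * under a given prefix length.
 */
static int __maybe_unused xfrm_addr_delta_selftest(void)
{
	xfrm_address_t a = {}, b = {};

	a.a4 = htonl(0xc0a80101);	/* 192.168.1.1 */
	b.a4 = htonl(0xc0a80164);	/* 192.168.1.100 */

	/* Both addresses fall into 192.168.1.0/24, so the masked prefixes
	 * compare equal and the delta is 0.
	 */
	WARN_ON(xfrm_policy_addr_delta(&a, &b, 24, AF_INET) != 0);

	/* At /32 the host parts differ; "a" sorts before "b", delta < 0. */
	return xfrm_policy_addr_delta(&a, &b, 32, AF_INET);
}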
static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { unsigned int matched_s, matched_d; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (!policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { if (policy->priority > p->priority) newpos = &p->bydst; else if (policy->priority == p->priority && policy->pos > p->pos) newpos = &p->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least * saddr or daddr for current node prefix. * * Matching both is fine, matching saddr in one policy * (but not daddr) and then matching only daddr in another * is a bug. */ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, &n->addr, n->prefixlen, family) == 0; matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, &n->addr, n->prefixlen, family) == 0; if (matches_s && matches_d) continue; WARN_ON_ONCE(!matches_s && !matches_d); if (matches_s) matched_s++; if (matches_d) matched_d++; WARN_ON_ONCE(matched_s && matched_d); } } static void xfrm_policy_inexact_node_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, struct rb_root *new, u16 family) { struct xfrm_pol_inexact_node *node; struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); restart: parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); prefixlen = min(node->prefixlen, n->prefixlen); delta = xfrm_policy_addr_delta(&n->addr, &node->addr, prefixlen, family); if (delta < 0) { p = &parent->rb_left; } else if (delta > 0) { p = &parent->rb_right; } else { bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } node->prefixlen = prefixlen; xfrm_policy_inexact_list_reinsert(net, node, family); if (same_prefixlen) { kfree_rcu(n, rcu); return; } rb_erase(*p, new); kfree_rcu(n, rcu); n = node; goto restart; } } rb_link_node_rcu(&n->node, parent, p); rb_insert_color(&n->node, new); } /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; struct rb_node *rnode; /* To-be-merged node v has a subtree. * * Dismantle it and insert its nodes to n->root. 
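 *
 * Each node taken off v->root is re-inserted one at a time through
 * xfrm_policy_inexact_node_reinsert(); when it lands on an existing node
 * covering the same prefix, the affected policies are re-sorted via
 * xfrm_policy_inexact_list_reinsert(). Finally v's own hhead policies
 * are moved over to n in the same way.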
*/ while ((rnode = rb_first(&v->root)) != NULL) { node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); rb_erase(&node->node, &v->root); xfrm_policy_inexact_node_reinsert(net, node, &n->root, family); } hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } xfrm_policy_inexact_list_reinsert(net, n, family); } static struct xfrm_pol_inexact_node * xfrm_policy_inexact_insert_node(struct net *net, struct rb_root *root, xfrm_address_t *addr, u16 family, u8 prefixlen, u8 dir) { struct xfrm_pol_inexact_node *cached = NULL; struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; p = &root->rb_node; while (*p) { int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta == 0 && prefixlen >= node->prefixlen) { WARN_ON_ONCE(cached); /* ipsec policies got lost */ return node; } if (delta < 0) p = &parent->rb_left; else p = &parent->rb_right; if (prefixlen < node->prefixlen) { delta = xfrm_policy_addr_delta(addr, &node->addr, prefixlen, family); if (delta) continue; /* This node is a subnet of the new prefix. It needs * to be removed and re-inserted with the smaller * prefix and all nodes that are now also covered * by the reduced prefixlen. */ rb_erase(&node->node, root); if (!cached) { xfrm_pol_inexact_node_init(node, addr, prefixlen); cached = node; } else { /* This node also falls within the new * prefixlen. Merge the to-be-reinserted * node and this one. */ xfrm_policy_inexact_node_merge(net, node, cached, family); kfree_rcu(node, rcu); } /* restart */ p = &root->rb_node; parent = NULL; } } node = cached; if (!node) { node = xfrm_pol_inexact_node_alloc(addr, prefixlen); if (!node) return NULL; } rb_link_node_rcu(&node->node, parent, p); rb_insert_color(&node->node, root); return node; } static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) { struct xfrm_pol_inexact_node *node; struct rb_node *rn = rb_first(r); while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } rb_erase(&node->node, r); kfree_rcu(node, rcu); } } static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { list_del(&b->inexact_bins); kfree_rcu(b, rcu); } } static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) { struct net *net = read_pnet(&b->k.net); spin_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_inexact_prune_bin(b, false); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static void __xfrm_policy_inexact_flush(struct net *net) { struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(bin, false); } static struct hlist_head * xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, struct xfrm_policy *policy, u8 dir) { struct xfrm_pol_inexact_node *n; struct net *net; net = 
xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_s, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } /* daddr is fixed */ write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_d, &policy->selector.daddr, policy->family, policy->selector.prefixlen_d, dir); write_seqcount_end(&bin->count); if (!n) return NULL; /* saddr is wildcard */ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, policy->family, policy->selector.prefixlen_s)) return &n->hhead; write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &n->root, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) { struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *delpol; struct hlist_head *chain; struct net *net; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) return ERR_PTR(-ENOMEM); net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); if (!chain) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-ENOMEM); } delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-EEXIST); } chain = &net->xfrm.policy_inexact[dir]; xfrm_policy_insert_inexact_list(chain, policy); if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_head *odst; struct hlist_node *newpos; int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. 
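 *
 * This makes the rebuild two-phase: the loop below only pre-allocates the
 * inexact bins and chains needed under the new thresholds, so a failed
 * allocation can still bail out (out_unlock) before any existing hash
 * table entries have been torn down.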
*/ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; dir = xfrm_policy_id2dir(policy->index); if (policy->walk.dead || dir >= XFRM_POLICY_MAX) continue; if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; sbits = lbits4; } else { dbits = rbits6; sbits = lbits6; } } else { if (policy->family == AF_INET) { dbits = lbits4; sbits = rbits4; } else { dbits = lbits6; sbits = rbits6; } } if (policy->selector.prefixlen_d < dbits || policy->selector.prefixlen_s < sbits) continue; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) goto out_unlock; } /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct hlist_node *n; hlist_for_each_entry_safe(policy, n, &net->xfrm.policy_inexact[dir], bydst_inexact_list) { hlist_del_rcu(&policy->bydst); hlist_del_init(&policy->bydst_inexact_list); } hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; for (i = hmask; i >= 0; i--) { hlist_for_each_entry_safe(policy, n, odst + i, bydst) hlist_del_rcu(&policy->bydst); } if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (policy->walk.dead) continue; dir = xfrm_policy_id2dir(policy->index); if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); continue; } hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } out_unlock: __xfrm_policy_inexact_flush(net); write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. 
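 *
 * Here the index simply encodes the direction in its low three bits:
 * idx = (idx_generator | dir) with idx_generator advancing by 8, so for
 * example dir == XFRM_POLICY_OUT (1) produces 1, 9, 17, ... and
 * xfrm_policy_id2dir() recovers the direction as index & 7.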
*/ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { static u32 idx_generator; for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (idx_generator | dir); idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (del_timer(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_bin *b = data; return xfrm_pol_bin_key(&b->k, 0, seed); } static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xfrm_pol_inexact_key *key = arg->key; const struct xfrm_pol_inexact_bin *b = ptr; int ret; if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) return -1; ret = b->k.dir ^ key->dir; if (ret) return ret; ret = b->k.type ^ key->type; if (ret) return ret; ret = b->k.family ^ key->family; if (ret) return ret; return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), .hashfn = xfrm_pol_bin_key, .obj_hashfn = xfrm_pol_bin_obj, .obj_cmpfn = xfrm_pol_bin_cmp, .automatic_shrinking = true, }; static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy) { struct xfrm_policy *pol, *delpol = NULL; struct hlist_node *newpos = NULL; int i = 0; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = &pol->bydst_inexact_list; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); hlist_for_each_entry(pol, chain, bydst_inexact_list) { pol->pos = i; i++; } } static struct xfrm_policy 
*xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) { struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = pol; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else /* Packet offload policies enter to the head * to speed-up lookups. */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *delpol; struct hlist_head *chain; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); else delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return PTR_ERR(delpol); } __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; } return NULL; } struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { struct xfrm_pol_inexact_candidates cand; int i; bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } if (!xfrm_policy_find_inexact_candidates(&cand, bin, &sel->saddr, &sel->daddr)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } pol = NULL; for (i = 0; i < ARRAY_SIZE(cand.res); i++) { struct xfrm_policy *tmp; tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, 
type, dir, sel, ctx); if (!tmp) continue; if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, sel, ctx); } if (pol) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete(pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); if (bin && delete) xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { dir = xfrm_policy_id2dir(pol->index); if (pol->walk.dead || dir >= XFRM_POLICY_MAX || pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid) { int dir, err = 0, cnt 
= 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { dir = xfrm_policy_id2dir(pol->index); if (pol->walk.dead || dir >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_dev_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. 
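 *
 * -ESRCH means "this policy does not match, keep looking"; callers treat
 * it as a soft miss, while any other error aborts the whole lookup.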
*/ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; int seq; again: seq = read_seqcount_begin(count); parent = rcu_dereference_raw(r->rb_node); while (parent) { struct xfrm_pol_inexact_node *node; int delta; node = rb_entry(parent, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta < 0) { parent = rcu_dereference_raw(parent->rb_left); continue; } else if (delta > 0) { parent = rcu_dereference_raw(parent->rb_right); continue; } return node; } if (read_seqcount_retry(count, seq)) goto again; return NULL; } static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct xfrm_pol_inexact_node *n; u16 family; if (!b) return false; family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; return true; } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, .if_id = if_id, }; write_pnet(&k.net, net); return rhashtable_lookup(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; } static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { int err; if (pol->priority > priority) break; err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); continue; } if (prefer) { /* matches. Is it older than *prefer? 
*/ if (pol->priority == priority && prefer->pos < pol->pos) return prefer; } return pol; } return NULL; } static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, fl, type, family, if_id); if (!tmp) continue; if (IS_ERR(tmp)) return tmp; prefer = tmp; } return prefer; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) goto skip_inexact; bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) goto fail; } skip_inexact: if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. 
*/ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. 
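 * Each template in the policy is resolved to an xfrm_state: tunnel and BEET
 * templates supply their own endpoint addresses (a local source address is
 * looked up if the template leaves it wildcarded), transport templates reuse
 * the flow's addresses. A missing state is fatal unless the template is
 * marked optional.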
*/ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, tmpl->encap_family, 0); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ? tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static int xfrm_get_tos(const struct flowi *fl, int family) { if (family == AF_INET) return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; path->path_cookie = rt6_get_cookie(rt); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. 
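 * The result is a chain of xfrm_dst entries, one per xfrm_state, with the
 * original route as the final child; each entry caches the state genid, its
 * route and the MTU values that xfrm_bundle_ok() and xfrm_init_pmtu() check
 * later.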
*/ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); tos = xfrm_get_tos(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; int oif; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); family = xfrm[i]->props.family; oif = fl->flowi_oif ? : fl->flowi_l3mdev; dst = xfrm_dst_lookup(xfrm[i], tos, oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->lastuse = now; dst1->input = dst_discard; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) dst1->output = afinfo->output; else dst1->output = dst_discard_out; rcu_read_unlock(); xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) { *num_pols = 0; return PTR_ERR(pols[0]); } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, pols[0]->if_id); if (pols[1]) { if 
(IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, dst->ops->family); skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); /* Fixup the mark to support VTI. 
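 * The skb mark is temporarily overwritten with the policy mark so that
 * xfrm_decode_session() classifies the flow against the VTI policy, then the
 * original mark is restored.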
*/ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); skb->mark = skb_mark; dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb->sk, skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (del_timer(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { 
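/* not even the dummy (queueing) bundle could be set up: drop the policy references and propagate the error */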
xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... */ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. 
We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) pols[i]->curlft.use_time = ktime_get_real_seconds(); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && dst->xfrm->props.mode == XFRM_MODE_TUNNEL) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. 
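 * xfrm_state_ok() below checks a single secpath state against one template:
 * proto, SPI, reqid, mode, auth algorithm and if_id must agree or be left
 * wildcarded in the template, and for non-transport modes the endpoint
 * addresses have to match as well.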
*/ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)) && (if_id == 0 || if_id == x->if_id); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, unsigned short family, u32 if_id) { int idx = start; if (tmpl->optional) { if (tmpl->mode == XFRM_MODE_TRANSPORT) return start; } else start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (idx < sp->verified_cnt) { /* Secpath entry previously verified, consider optional and * continue searching */ continue; } if (start == -1) start = -2-idx; break; } } return start; } static void decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) { const struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl; u8 *xprth = skb_network_header(skb) + ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; fl4->flowi4_oif = reverse ? skb->skb_iif : oif; fl4->flowi4_proto = iph->protocol; fl4->daddr = reverse ? iph->saddr : iph->daddr; fl4->saddr = reverse ? 
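/* a reverse (inbound) decode swaps the source/destination addresses and ports */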
iph->daddr : iph->saddr; fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK; if (!ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports; xprth = skb_network_header(skb) + ihl * 4; ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; fl4->fl4_dport = ports[!reverse]; } break; case IPPROTO_ICMP: if (xprth + 2 < skb->data || pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp; xprth = skb_network_header(skb) + ihl * 4; icmp = xprth; fl4->fl4_icmp_type = icmp[0]; fl4->fl4_icmp_code = icmp[1]; } break; case IPPROTO_GRE: if (xprth + 12 < skb->data || pskb_may_pull(skb, xprth + 12 - skb->data)) { __be16 *greflags; __be32 *gre_hdr; xprth = skb_network_header(skb) + ihl * 4; greflags = (__be16 *)xprth; gre_hdr = (__be32 *)xprth; if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) gre_hdr++; fl4->fl4_gre_key = gre_hdr[1]; } } break; default: break; } } } #if IS_ENABLED(CONFIG_IPV6) static void decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) { struct flowi6 *fl6 = &fl->u.ip6; int onlyproto = 0; const struct ipv6hdr *hdr = ipv6_hdr(skb); u32 offset = sizeof(*hdr); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u16 nhoff = IP6CB(skb)->nhoff; int oif = 0; u8 nexthdr; if (!nhoff) nhoff = offsetof(struct ipv6hdr, nexthdr); nexthdr = nh[nhoff]; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; fl6->flowi6_oif = reverse ? skb->skb_iif : oif; fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; while (nh + offset + sizeof(*exthdr) < skb->data || pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; fallthrough; case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: offset += ipv6_optlen(exthdr); nexthdr = exthdr->nexthdr; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (!onlyproto && (nh + offset + 4 < skb->data || pskb_may_pull(skb, nh + offset + 4 - skb->data))) { __be16 *ports; nh = skb_network_header(skb); ports = (__be16 *)(nh + offset); fl6->fl6_sport = ports[!!reverse]; fl6->fl6_dport = ports[!reverse]; } fl6->flowi6_proto = nexthdr; return; case IPPROTO_ICMPV6: if (!onlyproto && (nh + offset + 2 < skb->data || pskb_may_pull(skb, nh + offset + 2 - skb->data))) { u8 *icmp; nh = skb_network_header(skb); icmp = (u8 *)(nh + offset); fl6->fl6_icmp_type = icmp[0]; fl6->fl6_icmp_code = icmp[1]; } fl6->flowi6_proto = nexthdr; return; case IPPROTO_GRE: if (!onlyproto && (nh + offset + 12 < skb->data || pskb_may_pull(skb, nh + offset + 12 - skb->data))) { struct gre_base_hdr *gre_hdr; __be32 *gre_key; nh = skb_network_header(skb); gre_hdr = (struct gre_base_hdr *)(nh + offset); gre_key = (__be32 *)(gre_hdr + 1); if (gre_hdr->flags & GRE_KEY) { if (gre_hdr->flags & GRE_CSUM) gre_key++; fl6->fl6_gre_key = *gre_key; } } fl6->flowi6_proto = nexthdr; return; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: offset += ipv6_optlen(exthdr); if (!onlyproto && (nh + offset + 3 < skb->data || pskb_may_pull(skb, nh + offset + 3 - skb->data))) { struct ip6_mh *mh; nh = skb_network_header(skb); mh = (struct ip6_mh *)(nh + offset); fl6->fl6_mh_type = 
mh->ip6mh_type; } fl6->flowi6_proto = nexthdr; return; #endif default: fl6->flowi6_proto = nexthdr; return; } } } #endif int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { switch (family) { case AF_INET: decode_session4(skb, fl, reverse); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: decode_session6(skb, fl, reverse); break; #endif default: return -EAFNOSUPPORT; } return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { struct xfrm_if_decode_session_result r; if (ifcb->decode_session(skb, family, &r)) { if_id = r.if_id; net = r.net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ sp = skb_sec_path(skb); if (sp) { int i; for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol) { if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } /* This lockless write can happen from different cpus. */ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); xfrm_pol_put(pols[0]); return 0; } /* This write can happen from different cpus. 
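 * WRITE_ONCE() is used instead of taking the policy lock; a racy update of
 * use_time is harmless because it only records when the policy was last used.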
*/ WRITE_ONCE(pols[1]->curlft.use_time, ktime_get_real_seconds()); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int ti = 0; int i, k; sp = skb_sec_path(skb); if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. * Upon success, marks secpath entries as having been * verified to allow them to be skipped in future policy * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } xfrm_pols_put(pols, npols); sp->verified_cnt = k; return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. 
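 * In short: the cached bundle is returned only while dst->obsolete is still
 * DST_OBSOLETE_FORCE_CHK (negative) and stale_bundle() confirms that every
 * dst in the chain is valid.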
*/ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) { if (dst) { if (dst->obsolete) { dst_release(dst); dst = NULL; } } return dst; } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. */ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? 
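/* an explicitly set RTAX_MTU metric wins over the path MTU */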
: dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (xfrm_policy_afinfo[i] != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { xfrm_dst_cache = 
kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(htab->table, sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); xfrm_hash_free(htab->table, sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT; rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; return 0; out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = 
xfrm_net_init, .exit = xfrm_net_exit, }; void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, const struct xfrm_selector *sel_tgt) { if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { if (sel_tgt->family == sel_cmp->family && xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, sel_cmp->family) && xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, sel_cmp->family) && sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { return true; } } else { if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { return true; } } return false; } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; u32 priority = ~0U; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; break; } } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; } } xfrm_pol_hold(ret); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } static int migrate_tmpl_match(const struct 
xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) { NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; } } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (!pol) { NL_SET_ERR_MSG(extack, "Target policy not found"); err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net, if_id))) { 
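/* remember the original state and create its migrated clone; if cloning fails, undo everything via restore_state */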
x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
*/ #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <net/udp_tunnel.h> #include <net/sch_generic.h> #include <linux/netfilter.h> #include <rdma/ib_addr.h> #include "rxe.h" #include "rxe_net.h" #include "rxe_loc.h" static struct rxe_recv_sockets recv_sockets; static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, struct in_addr *saddr, struct in_addr *daddr) { struct rtable *rt; struct flowi4 fl = { { 0 } }; memset(&fl, 0, sizeof(fl)); fl.flowi4_oif = ndev->ifindex; memcpy(&fl.saddr, saddr, sizeof(*saddr)); memcpy(&fl.daddr, daddr, sizeof(*daddr)); fl.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(&init_net, &fl); if (IS_ERR(rt)) { rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; } return &rt->dst; } #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { struct dst_entry *ndst; struct flowi6 fl6 = { { 0 } }; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = ndev->ifindex; memcpy(&fl6.saddr, saddr, sizeof(*saddr)); memcpy(&fl6.daddr, daddr, sizeof(*daddr)); fl6.flowi6_proto = IPPROTO_UDP; ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), recv_sockets.sk6->sk, &fl6, NULL); if (IS_ERR(ndst)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; } if (unlikely(ndst->error)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); goto put; } return ndst; put: dst_release(ndst); return NULL; } #else static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { return NULL; } #endif static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { struct dst_entry *dst = NULL; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); if (!dst || !dst_check(dst, qp->dst_cookie)) { if (dst) dst_release(dst); if (av->network_type == RXE_NETWORK_TYPE_IPV4) { struct in_addr *saddr; struct in_addr *daddr; saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route4(qp, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route6(qp, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); #endif } if (dst && (qp_type(qp) == IB_QPT_RC)) { dst_hold(dst); sk_dst_set(qp->sk->sk, dst); } } return dst; } static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct udphdr *udph; struct rxe_dev *rxe; struct net_device *ndev = skb->dev; struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); /* takes a reference on rxe->ib_dev * drop when skb is freed */ rxe = rxe_get_dev_from_net(ndev); if (!rxe && is_vlan_dev(ndev)) rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); if (!rxe) goto drop; if (skb_linearize(skb)) { ib_device_put(&rxe->ib_dev); goto drop; } udph = udp_hdr(skb); pkt->rxe = rxe; pkt->port_num = 1; pkt->hdr = (u8 *)(udph + 1); pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; drop: kfree_skb(skb); return 0; } static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, bool ipv6) { int err; struct 
socket *sock; struct udp_port_cfg udp_cfg = { }; struct udp_tunnel_sock_cfg tnl_cfg = { }; if (ipv6) { udp_cfg.family = AF_INET6; udp_cfg.ipv6_v6only = 1; } else { udp_cfg.family = AF_INET; } udp_cfg.local_udp_port = port; /* Create UDP socket */ err = udp_sock_create(net, &udp_cfg, &sock); if (err < 0) return ERR_PTR(err); tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; /* Setup UDP tunnel */ setup_udp_tunnel_sock(net, sock, &tnl_cfg); return sock; } static void rxe_release_udp_tunnel(struct socket *sk) { if (sk) udp_tunnel_sock_release(sk); } static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, __be16 dst_port) { struct udphdr *udph; __skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->dest = dst_port; udph->source = src_port; udph->len = htons(skb->len); udph->check = 0; } static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, __be32 saddr, __be32 daddr, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { struct iphdr *iph; skb_scrub_packet(skb, xnet); skb_clear_hash(skb); skb_dst_set(skb, dst_clone(dst)); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = IPVERSION; iph->ihl = sizeof(struct iphdr) >> 2; iph->tot_len = htons(skb->len); iph->frag_off = df; iph->protocol = proto; iph->tos = tos; iph->daddr = daddr; iph->saddr = saddr; iph->ttl = ttl; __ip_select_ident(dev_net(dst->dev), iph, skb_shinfo(skb)->gso_segs ?: 1); } static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr, __u8 proto, __u8 prio, __u8 ttl) { struct ipv6hdr *ip6h; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_set(skb, dst_clone(dst)); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, htonl(0)); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = proto; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; bool xnet = false; __be16 df = htons(IP_DF); struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); dst_release(dst); return 0; } static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit); dst_release(dst); return 0; } int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err = 0; if 
(skb->protocol == htons(ETH_P_IP)) err = prepare4(av, pkt, skb); else if (skb->protocol == htons(ETH_P_IPV6)) err = prepare6(av, pkt, skb); if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) pkt->mask |= RXE_LOOPBACK_MASK; return err; } static void rxe_skb_tx_dtor(struct sk_buff *skb) { struct sock *sk = skb->sk; struct rxe_qp *qp = sk->sk_user_data; int skb_out = atomic_dec_return(&qp->skb_out); if (unlikely(qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) rxe_sched_task(&qp->req.task); rxe_put(qp); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; skb->destructor = rxe_skb_tx_dtor; skb->sk = pkt->qp->sk->sk; rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) { err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else if (skb->protocol == htons(ETH_P_IPV6)) { err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else { rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", skb->protocol); atomic_dec(&pkt->qp->skb_out); rxe_put(pkt->qp); kfree_skb(skb); return -EINVAL; } if (unlikely(net_xmit_eval(err))) { rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); return -EAGAIN; } return 0; } /* fix up a send packet to match the packets * received from UDP before looping them back */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else skb_pull(skb, sizeof(struct ipv6hdr)); if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) { kfree_skb(skb); return -EIO; } /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; } int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err; int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); if (pkt->mask & RXE_LOOPBACK_MASK) err = rxe_loopback(skb, pkt); else err = rxe_send(skb, pkt); if (err) { rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); return err; } if ((qp_type(qp) != IB_QPT_RC) && (pkt->mask & RXE_END_MASK)) { pkt->wqe->state = wqe_state_done; rxe_sched_task(&qp->comp.task); } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; drop: kfree_skb(skb); err = 0; done: return err; } struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { unsigned int hdr_len; struct sk_buff *skb = NULL; struct net_device *ndev; const struct ib_gid_attr *attr; const int port_num = 1; attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); if (IS_ERR(attr)) return NULL; if (av->network_type == RXE_NETWORK_TYPE_IPV4) hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct iphdr); else hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct ipv6hdr); rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (IS_ERR(ndev)) { rcu_read_unlock(); goto out; } skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), GFP_ATOMIC); if (unlikely(!skb)) { rcu_read_unlock(); goto out; } skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); /* FIXME: hold reference to this netdev until life of this skb. 
*/ skb->dev = ndev; rcu_read_unlock(); if (av->network_type == RXE_NETWORK_TYPE_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); pkt->rxe = rxe; pkt->port_num = port_num; pkt->hdr = skb_put(skb, paylen); pkt->mask |= RXE_GRH_MASK; out: rdma_put_gid_attr(attr); return skb; } /* * this is required by rxe_cfg to match rxe devices in * /sys/class/infiniband up with their underlying ethernet devices */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { return rxe->ndev->name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) { int err; struct rxe_dev *rxe = NULL; rxe = ib_alloc_device(rxe_dev, ib_dev); if (!rxe) return -ENOMEM; rxe->ndev = ndev; err = rxe_add(rxe, ndev->mtu, ibdev_name); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; } return 0; } static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) { struct ib_event ev; ev.device = &rxe->ib_dev; ev.element.port_num = 1; ev.event = event; ib_dispatch_event(&ev); } /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { struct rxe_port *port; port = &rxe->port; port->attr.state = IB_PORT_ACTIVE; rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { struct rxe_port *port; port = &rxe->port; port->attr.state = IB_PORT_DOWN; rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); } void rxe_set_port_state(struct rxe_dev *rxe) { if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) rxe_port_up(rxe); else rxe_port_down(rxe); } static int rxe_notify(struct notifier_block *not_blk, unsigned long event, void *arg) { struct net_device *ndev = netdev_notifier_info_to_dev(arg); struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); if (!rxe) return NOTIFY_OK; switch (event) { case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; case NETDEV_UP: rxe_port_up(rxe); break; case NETDEV_DOWN: rxe_port_down(rxe); break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_CHANGE: rxe_set_port_state(rxe); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } ib_device_put(&rxe->ib_dev); return NOTIFY_OK; } static struct notifier_block rxe_net_notifier = { .notifier_call = rxe_notify, }; static int rxe_net_ipv4_init(void) { recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false); if (IS_ERR(recv_sockets.sk4)) { recv_sockets.sk4 = NULL; pr_err("Failed to create IPv4 UDP tunnel\n"); return -1; } return 0; } static int rxe_net_ipv6_init(void) { #if IS_ENABLED(CONFIG_IPV6) recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true); if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { recv_sockets.sk6 = NULL; pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); return 0; } if (IS_ERR(recv_sockets.sk6)) { recv_sockets.sk6 = NULL; pr_err("Failed to create IPv6 UDP tunnel\n"); return -1; } #endif return 0; } void rxe_net_exit(void) { rxe_release_udp_tunnel(recv_sockets.sk6); rxe_release_udp_tunnel(recv_sockets.sk4); unregister_netdevice_notifier(&rxe_net_notifier); } int rxe_net_init(void) { int err; recv_sockets.sk6 = NULL; err = 
rxe_net_ipv4_init();
	if (err)
		return err;

	err = rxe_net_ipv6_init();
	if (err)
		goto err_out;

	err = register_netdevice_notifier(&rxe_net_notifier);
	if (err) {
		pr_err("Failed to register netdev notifier\n");
		goto err_out;
	}

	return 0;

err_out:
	rxe_net_exit();
	return err;
}
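/*
 * Illustrative sketch, not part of the original rxe_net.c: a minimal caller
 * of rxe_net_add(), roughly what the rxe configuration path does when a new
 * soft-RoCE device is requested on top of an ethernet interface. The foo_*
 * names are hypothetical; error handling is reduced to the essentials and
 * the surrounding file's includes (<linux/netdevice.h>, "rxe_net.h") are
 * assumed.
 */
static int foo_attach_rxe(const char *ibdev_name, const char *netdev_name)
{
	struct net_device *ndev;
	int err;

	/* dev_get_by_name() takes a reference on the netdev */
	ndev = dev_get_by_name(&init_net, netdev_name);
	if (!ndev)
		return -ENODEV;

	/* registers a new ib_device named ibdev_name on top of ndev */
	err = rxe_net_add(ibdev_name, ndev);

	dev_put(ndev);
	return err;
}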
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>

#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)

static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
	      struct bpf_local_storage_elem *selem)
{
	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}

static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	struct bpf_map *map = &smap->map;

	if (!map->ops->map_local_storage_charge)
		return 0;

	return map->ops->map_local_storage_charge(smap, owner, size);
}

static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	struct bpf_map *map = &smap->map;

	if (map->ops->map_local_storage_uncharge)
		map->ops->map_local_storage_uncharge(smap, owner, size);
}

static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map *smap, void *owner)
{
	struct bpf_map *map = &smap->map;

	return map->ops->map_owner_storage_ptr(owner);
}

static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->snode);
}

static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}

static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->map_node);
}

static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}

struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
		void *value, bool charge_mem, gfp_t gfp_flags)
{
	struct bpf_local_storage_elem *selem;

	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
		return NULL;

	if (smap->bpf_ma) {
		migrate_disable();
		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
		migrate_enable();
		if (selem)
			/* Keep the original bpf_map_kzalloc behavior
			 * from before bpf_mem_cache_alloc was used.
			 *
			 * No need to use zero_map_value. The bpf_selem_free()
			 * only does bpf_mem_cache_free when no other bpf prog
			 * is using the selem.
			 */
			memset(SDATA(selem)->data, 0, smap->map.value_size);
	} else {
		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
					gfp_flags | __GFP_NOWARN);
	}

	if (selem) {
		if (value)
			copy_map_value(&smap->map, SDATA(selem)->data, value);
		/* No need to call check_and_init_map_value as memory is zero init */
		return selem;
	}

	if (charge_mem)
		mem_uncharge(smap, owner, smap->elem_size);

	return NULL;
}

/* rcu tasks trace callback for bpf_ma == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	/* If RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree(), else do kfree_rcu().
*/ local_storage = container_of(rcu, struct bpf_local_storage, rcu); if (rcu_trace_implies_rcu_gp()) kfree(local_storage); else kfree_rcu(local_storage, rcu); } static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); bpf_mem_cache_raw_free(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_local_storage_free_rcu(rcu); else call_rcu(rcu, bpf_local_storage_free_rcu); } /* Handle bpf_ma == false */ static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(local_storage, rcu); else call_rcu_tasks_trace(&local_storage->rcu, __bpf_local_storage_free_trace_rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool bpf_ma, bool reuse_now) { if (!local_storage) return; if (!bpf_ma) { __bpf_local_storage_free(local_storage, reuse_now); return; } if (!reuse_now) { call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); return; } if (smap) { migrate_disable(); bpf_mem_cache_free(&smap->storage_ma, local_storage); migrate_enable(); } else { /* smap could be NULL if the selem that triggered * this 'local_storage' creation had been long gone. * In this case, directly do call_rcu(). */ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } } /* rcu tasks trace callback for bpf_ma == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); if (rcu_trace_implies_rcu_gp()) kfree(selem); else kfree_rcu(selem, rcu); } /* Handle bpf_ma == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(selem, rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); bpf_mem_cache_raw_free(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_selem_free_rcu(rcu); else call_rcu(rcu, bpf_selem_free_rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); if (!smap->bpf_ma) { __bpf_selem_free(selem, reuse_now); return; } if (!reuse_now) { call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } else { /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. */ migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); } } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, bool uncharge_mem, bool reuse_now) { struct bpf_local_storage_map *smap; bool free_local_storage; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. * The owner may be freed once the last selem is unlinked * from local_storage. 
*/ if (uncharge_mem) mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); if (free_local_storage) { mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); local_storage->owner = NULL; /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); /* local_storage is not freed now. local_storage->lock is * still held and raw_spin_unlock_bh(&local_storage->lock) * will be done by the caller. * * Although the unlock will be done under * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned * to the caller which then calls then frees the storage after * all the RCU grace periods have expired. */ } hlist_del_init_rcu(&selem->snode); if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); bpf_selem_free(selem, smap, reuse_now); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); return free_local_storage; } static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *storage_smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *selem_smap; /* local_storage->smap may be NULL. If it is, get the bpf_ma * from any selem in the local_storage->list. The bpf_ma of all * local_storage and selem should have the same value * for the same map type. * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not * responsibile to free the local_storage. 
*/ if (storage_smap) return storage_smap->bpf_ma; if (!selem) { struct hlist_node *n; n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), bpf_rcu_lock_held()); if (!n) return false; selem = hlist_entry(n, struct bpf_local_storage_elem, snode); } selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); return selem_smap->bpf_ma; } static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, reuse_now); raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ return; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); bpf_selem_unlink_storage(selem, reuse_now); } /* If cacheit_lockit is false, this lookup function is lockless */ struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit) { struct bpf_local_storage_data *sdata; struct bpf_local_storage_elem *selem; /* Fast path (cache hit) */ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], bpf_rcu_lock_held()); if (sdata && rcu_access_pointer(sdata->smap) == smap) return sdata; /* Slow path (cache miss) */ hlist_for_each_entry_rcu(selem, &local_storage->list, snode, rcu_read_lock_trace_held()) if (rcu_access_pointer(SDATA(selem)->smap) == smap) break; if (!selem) return NULL; sdata = SDATA(selem); if (cacheit_lockit) { unsigned long flags; /* spinlock is needed to avoid racing with the * parallel delete. 
Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; } static int check_flags(const struct bpf_local_storage_data *old_sdata, u64 map_flags) { if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem, gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; int err; err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; if (smap->bpf_ma) { migrate_disable(); storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); migrate_enable(); } else { storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); } if (!storage) { err = -ENOMEM; goto uncharge; } RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); /* Publish storage to the owner. * Instead of using any lock of the kernel object (i.e. owner), * cmpxchg will work with any kernel object regardless what * the running context is, bh, irq...etc. * * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) * is protected by the storage->lock. Hence, when freeing * the owner->storage, the storage->lock must be held before * setting owner->storage ptr to NULL. */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; /* Note that even first_selem was linked to smap's * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a * synchronize_rcu_mult (waiting for both sleepable and * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). */ } return 0; uncharge: bpf_local_storage_free(storage, smap, smap->bpf_ma, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } /* sk cannot be going away because it is linking new elem * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). * Otherwise, it will become a leak (and other memory issues * during map destruction). 
*/ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) return ERR_PTR(-EINVAL); local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { bpf_selem_free(selem, smap, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } return SDATA(selem); } if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { /* Hoping to find an old_sdata to do inline update * such that it can avoid taking the local_storage->lock * and changing the lists. */ old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } } /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { /* A parallel del is happening and local_storage is going * away. It has just been checked before, so very * unlikely. Return instead of retry to keep things * simple. */ err = -EAGAIN; goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); selem = SELEM(old_sdata); goto unlock; } alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); } return err ? 
ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; spin_lock(&cache->idx_lock); for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { if (cache->idx_usage_counts[i] < min_usage) { min_usage = cache->idx_usage_counts[i]; res = i; /* Found a free cache_idx */ if (!min_usage) break; } } cache->idx_usage_counts[res]++; spin_unlock(&cache->idx_lock); return res; } static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; return 0; } int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { u32 int_data; if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; } void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; struct hlist_node *n; unsigned long flags; storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the * local_storage->list by the bpf_prog or by the bpf_map's syscall. * * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the local_storage->list and * the map's bucket->list. */ raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. */ bpf_selem_unlink_map(selem); /* If local_storage list has only one element, the * bpf_selem_unlink_storage_nolock() will return true. * Otherwise, it will return false. The current loop iteration * intends to remove all local storage. So the last iteration * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) { struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; u64 usage = sizeof(*smap); /* The dynamically callocated selems are not counted currently. */ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); return usage; } /* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. * A deadlock free allocator is useful for storage that the bpf prog can easily * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. * The task and cgroup storage fall into this case. 
The bpf_mem_alloc reuses * memory immediately. To be reuse-immediate safe, the owner destruction * code path needs to go through a rcu grace period before calling * bpf_local_storage_destroy(). * * When bpf_ma == false, the kmalloc and kfree are used. */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, bool bpf_ma) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; } for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); raw_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); smap->bpf_ma = bpf_ma; if (bpf_ma) { err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); if (err) goto free_smap; err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); if (err) { bpf_mem_alloc_destroy(&smap->selem_ma); goto free_smap; } } smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; free_smap: kvfree(smap->buckets); bpf_map_area_free(smap); return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, int __percpu *busy_counter) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; unsigned int i; smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(cache, smap->cache_idx); /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU * read sections should be prevented via bpf_map_inc_not_zero. */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map * now. No new selem (of this map) can be added * to the owner->storage or to the map bucket's list. * * The elem of this map can be cleaned up here * or when the storage is freed e.g. * by bpf_sk_storage_free() during __sk_destruct(). */ for (i = 0; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); /* No one is adding to b->list now */ while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, true); if (busy_counter) { this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); } rcu_read_unlock(); } /* While freeing the storage we may still need to access the map. * * e.g. when bpf_sk_storage_free() has unlinked selem from the map * which then made the above while((selem = ...)) loop * exit immediately. * * However, while freeing the storage one still needs to access the * smap->elem_size to do the uncharging in * bpf_selem_unlink_storage_nolock(). * * Hence, wait another rcu grace period for the storage to be freed. 
 */
	synchronize_rcu();

	if (smap->bpf_ma) {
		bpf_mem_alloc_destroy(&smap->selem_ma);
		bpf_mem_alloc_destroy(&smap->storage_ma);
	}
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}
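/*
 * Illustrative sketch, not part of the original file: how an owner-specific
 * storage type would typically wire its map_alloc/map_free callbacks to the
 * helpers above, loosely modeled on the task/sk storage users. The foo_*
 * names are hypothetical; DEFINE_BPF_STORAGE_CACHE() comes from
 * <linux/bpf_local_storage.h> and DEFINE_PER_CPU() from <linux/percpu.h>,
 * both of which are assumed to be available here.
 */
DEFINE_BPF_STORAGE_CACHE(foo_cache);
static DEFINE_PER_CPU(int, foo_storage_busy);

static struct bpf_map *foo_storage_map_alloc(union bpf_attr *attr)
{
	/* bpf_ma == true: selems and storages come from bpf_mem_alloc,
	 * which is safe in any context the owner pointer can be reached in.
	 */
	return bpf_local_storage_map_alloc(attr, &foo_cache, true);
}

static void foo_storage_map_free(struct bpf_map *map)
{
	/* The per-cpu busy counter guards against recursive updates while
	 * selems are being unlinked; passing NULL is also accepted.
	 */
	bpf_local_storage_map_free(map, &foo_cache, &foo_storage_busy);
}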
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N.
contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #include "translation-table.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "soft-interface.h" #include "tvlv.h" static struct kmem_cache *batadv_tl_cache __read_mostly; static struct kmem_cache *batadv_tg_cache __read_mostly; static struct kmem_cache *batadv_tt_orig_cache __read_mostly; static struct kmem_cache *batadv_tt_change_cache __read_mostly; static struct kmem_cache *batadv_tt_req_cache __read_mostly; static struct kmem_cache *batadv_tt_roam_cache __read_mostly; /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry); static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming); /** * batadv_compare_tt() - check if two TT entries are the same * @node: the list element pointer of the first TT entry * @data2: pointer to the tt_common_entry of the second TT entry * * Compare the MAC address and the VLAN ID of the two TT entries and check if * they are the same TT client. * Return: true if the two TT clients are the same, false otherwise */ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); const struct batadv_tt_common_entry *tt1 = data1; const struct batadv_tt_common_entry *tt2 = data2; return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2); } /** * batadv_choose_tt() - return the index of the tt entry in the hash table * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * * Return: the hash index where the object represented by 'data' should be * stored at. 
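 *
 * Roughly speaking (illustrative restatement, not additional code in the
 * original), the index is derived by chaining two jhash() rounds and folding
 * the result into the table size:
 *
 *    hash = jhash(&tt->addr, ETH_ALEN, 0);
 *    hash = jhash(&tt->vid, sizeof(tt->vid), hash);
 *    index = hash % size;
 *
 * so two entries sharing a MAC address but using different VLAN IDs will
 * usually end up in different buckets.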
*/ static inline u32 batadv_choose_tt(const void *data, u32 size) { const struct batadv_tt_common_entry *tt; u32 hash = 0; tt = data; hash = jhash(&tt->addr, ETH_ALEN, hash); hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } /** * batadv_tt_hash_find() - look for a client in the given hash table * @hash: the hash table to search * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, unsigned short vid) { struct hlist_head *head; struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL; u32 index; if (!hash) return NULL; ether_addr_copy(to_search.addr, addr); to_search.vid = vid; index = batadv_choose_tt(&to_search, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(tt, head, hash_entry) { if (!batadv_compare_eth(tt, addr)) continue; if (tt->vid != vid) continue; if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; break; } rcu_read_unlock(); return tt_tmp; } /** * batadv_tt_local_hash_find() - search the local table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. */ static struct batadv_tt_local_entry * batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.local_hash, addr, vid); if (tt_common_entry) tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); return tt_local_entry; } /** * batadv_tt_global_hash_find() - search the global table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. 
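 *
 * The returned entry carries a reference which the caller must drop with
 * batadv_tt_global_entry_put(). A minimal sketch, mirroring what
 * batadv_tt_global_hash_count() below does:
 *
 *    tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
 *    if (tt_global) {
 *        count = atomic_read(&tt_global->orig_list_count);
 *        batadv_tt_global_entry_put(tt_global);
 *    }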
*/ struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.global_hash, addr, vid); if (tt_common_entry) tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, common); return tt_global_entry; } /** * batadv_tt_local_entry_free_rcu() - free the tt_local_entry * @rcu: rcu pointer of the tt_local_entry */ static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_local_entry *tt_local_entry; tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, common.rcu); kmem_cache_free(batadv_tl_cache, tt_local_entry); } /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_tt_local_entry_release(struct kref *ref) { struct batadv_tt_local_entry *tt_local_entry; tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); batadv_softif_vlan_put(tt_local_entry->vlan); call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); } /** * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and * possibly release it * @tt_local_entry: tt_local_entry to be free'd */ static void batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { if (!tt_local_entry) return; kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); } /** * batadv_tt_global_entry_free_rcu() - free the tt_global_entry * @rcu: rcu pointer of the tt_global_entry */ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_global_entry *tt_global_entry; tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, common.rcu); kmem_cache_free(batadv_tg_cache, tt_global_entry); } /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ void batadv_tt_global_entry_release(struct kref *ref) { struct batadv_tt_global_entry *tt_global_entry; tt_global_entry = container_of(ref, struct batadv_tt_global_entry, common.refcount); batadv_tt_global_del_orig_list(tt_global_entry); call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); } /** * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data * (excluding our self). 
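 *
 * Illustrative use only (not taken from this file): a caller that merely
 * wants to know whether any other originator announces the client can treat
 * the count as a boolean:
 *
 *    if (batadv_tt_global_hash_count(bat_priv, addr, vid) > 0)
 *        announced_elsewhere = true;
 *
 * where announced_elsewhere is a hypothetical local flag.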
*/ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; int count; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) return 0; count = atomic_read(&tt_global_entry->orig_list_count); batadv_tt_global_entry_put(tt_global_entry); return count; } /** * batadv_tt_local_size_mod() - change the size by v of the local table * identified by vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier of the sub-table to change * @v: the amount to sum to the local table size */ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, unsigned short vid, int v) { struct batadv_softif_vlan *vlan; vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return; atomic_add(v, &vlan->tt.num_entries); batadv_softif_vlan_put(vlan); } /** * batadv_tt_local_size_inc() - increase by one the local table size for the * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, 1); } /** * batadv_tt_local_size_dec() - decrease by one the local table size for the * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, -1); } /** * batadv_tt_global_size_mod() - change the size by v of the global table * for orig_node identified by vid * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, unsigned short vid, int v) { struct batadv_orig_node_vlan *vlan; vlan = batadv_orig_node_vlan_new(orig_node, vid); if (!vlan) return; if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); if (!hlist_unhashed(&vlan->list)) { hlist_del_init_rcu(&vlan->list); batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); } batadv_orig_node_vlan_put(vlan); } /** * batadv_tt_global_size_inc() - increase by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, 1); } /** * batadv_tt_global_size_dec() - decrease by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, -1); } /** * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry * @rcu: rcu pointer of the orig_entry */ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); kmem_cache_free(batadv_tt_orig_cache, orig_entry); } /** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry */ static void 
batadv_tt_orig_list_entry_release(struct kref *ref) { struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, refcount); batadv_orig_node_put(orig_entry->orig_node); call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } /** * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and * possibly release it * @orig_entry: tt orig entry to be free'd */ static void batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { if (!orig_entry) return; kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** * batadv_tt_local_event() - store a local TT event (ADD/DEL) * @bat_priv: the bat priv with all the soft interface information * @tt_local_entry: the TT entry involved in the event * @event_flags: flags to store in the event structure */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u8 event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; u8 flags = common->flags | event_flags; bool event_removed = false; bool del_op_requested, del_op_entry; tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) return; tt_change_node->change.flags = flags; memset(tt_change_node->change.reserved, 0, sizeof(tt_change_node->change.reserved)); ether_addr_copy(tt_change_node->change.addr, common->addr); tt_change_node->change.vid = htons(common->vid); del_op_requested = flags & BATADV_TT_CLIENT_DEL; /* check for ADD+DEL or DEL+ADD events */ spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (!batadv_compare_eth(entry->change.addr, common->addr)) continue; /* DEL+ADD in the same orig interval have no effect and can be * removed to avoid silly behaviour on the receiver side. The * other way around (ADD+DEL) can happen in case of roaming of * a client still in the NEW state. Roaming of NEW clients is * now possible due to automatically recognition of "temporary" * clients */ del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL; if (!del_op_requested && del_op_entry) goto del; if (del_op_requested && !del_op_entry) goto del; /* this is a second add in the same originator interval. It * means that flags have been changed: update them! */ if (!del_op_requested && !del_op_entry) entry->change.flags = flags; continue; del: list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); kmem_cache_free(batadv_tt_change_cache, tt_change_node); event_removed = true; goto unlock; } /* track the change in the OGMinterval list */ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list); unlock: spin_unlock_bh(&bat_priv->tt.changes_list_lock); if (event_removed) atomic_dec(&bat_priv->tt.local_changes); else atomic_inc(&bat_priv->tt.local_changes); } /** * batadv_tt_len() - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { return changes_num * sizeof(struct batadv_tvlv_tt_change); } /** * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes * @tt_len: available space * * Return: the number of entries. 
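 *
 * Worked example (the record size is assumed here, not guaranteed): with
 * sizeof(struct batadv_tvlv_tt_change) == 12, a 100 byte buffer yields
 *
 *    batadv_tt_entries(100) == 100 / batadv_tt_len(1) == 100 / 12 == 8
 *
 * entries; the 4 trailing bytes are simply left unused.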
*/ static u16 batadv_tt_entries(u16 tt_len) { return tt_len / batadv_tt_len(1); } /** * batadv_tt_local_table_transmit_size() - calculates the local translation * table size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { u16 num_vlan = 0; u16 tt_local_entries = 0; struct batadv_softif_vlan *vlan; int hdr_size; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { num_vlan++; tt_local_entries += atomic_read(&vlan->tt.num_entries); } rcu_read_unlock(); /* header size of tvlv encapsulated tt response payload */ hdr_size = sizeof(struct batadv_unicast_tvlv_packet); hdr_size += sizeof(struct batadv_tvlv_hdr); hdr_size += sizeof(struct batadv_tvlv_tt_data); hdr_size += num_vlan * sizeof(struct batadv_tvlv_tt_vlan_data); return hdr_size + batadv_tt_len(tt_local_entries); } static int batadv_tt_local_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.local_hash) return 0; bat_priv->tt.local_hash = batadv_hash_new(1024); if (!bat_priv->tt.local_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.local_hash, &batadv_tt_local_hash_lock_class_key); return 0; } static void batadv_tt_global_free(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global, const char *message) { struct batadv_tt_global_entry *tt_removed_entry; struct hlist_node *tt_removed_node; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), message); tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); if (!tt_removed_node) return; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_global_entry, common.hash_entry); batadv_tt_global_entry_put(tt_removed_entry); } /** * batadv_tt_local_add() - add a new client to the local table or update an * existing client * @soft_iface: netdev struct of the mesh interface * @addr: the mac address of the client to add * @vid: VLAN identifier * @ifindex: index of the interface where the client is connected to (useful to * identify wireless clients) * @mark: the value contained in the skb->mark field of the received packet (if * any) * * Return: true if the client was successfully added, false otherwise. 
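 *
 * Hypothetical caller sketch (not taken from this file): code on the soft
 * interface that learns a new source MAC from a received frame could register
 * it as a local client with something like
 *
 *    batadv_tt_local_add(soft_iface, ethhdr->h_source, vid,
 *                        skb->skb_iif, skb->mark);
 *
 * where ethhdr, vid and skb are whatever the caller extracted from the frame.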
*/ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; struct net *net = dev_net(soft_iface); struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct batadv_hard_iface *in_hardif = NULL; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; bool ret = false; bool roamed_back = false; u8 remote_flags; u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(net, ifindex); if (in_dev) in_hardif = batadv_hardif_get_by_netdev(in_dev); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!is_multicast_ether_addr(addr)) tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (tt_local) { tt_local->last_seen = jiffies; if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Re-adding pending client %pM (vid: %d)\n", addr, batadv_print_vid(vid)); /* whatever the reason why the PENDING flag was set, * this is a client which was enqueued to be removed in * this orig_interval. Since it popped up again, the * flag can be reset like it was never enqueued */ tt_local->common.flags &= ~BATADV_TT_CLIENT_PENDING; goto add_event; } if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Roaming client %pM (vid: %d) came back to its original location\n", addr, batadv_print_vid(vid)); /* the ROAM flag is set because this client roamed away * and the node got a roaming_advertisement message. Now * that the client popped up again at its original * location such flag can be unset */ tt_local->common.flags &= ~BATADV_TT_CLIENT_ROAM; roamed_back = true; } goto check_roaming; } /* Ignore the client if we cannot send it in a full table response. 
*/ table_size = batadv_tt_local_table_transmit_size(bat_priv); table_size += batadv_tt_len(1); packet_size_max = atomic_read(&bat_priv->packet_size_max); if (table_size > packet_size_max) { net_ratelimited_function(batadv_info, soft_iface, "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n", table_size, packet_size_max, addr); goto out; } tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC); if (!tt_local) goto out; /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) { net_ratelimited_function(batadv_info, soft_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, batadv_print_vid(vid)); kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; } batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, batadv_print_vid(vid), (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in * a full table response going out before the next ttvn increment * (consistency check) */ tt_local->common.flags = BATADV_TT_CLIENT_NEW; tt_local->common.vid = vid; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; kref_init(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged */ if (batadv_compare_eth(addr, soft_iface->dev_addr) || is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; kref_get(&tt_local->common.refcount); hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local->common, &tt_local->common.hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_put(tt_local); goto out; } add_event: batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); check_roaming: /* Check whether it is a roaming, but don't do anything if the roaming * process has already been handled */ if (tt_global && !(tt_global->common.flags & BATADV_TT_CLIENT_ROAM)) { /* These node are probably going to update their tt table */ head = &tt_global->orig_list; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, head, list) { batadv_send_roam_adv(bat_priv, tt_global->common.addr, tt_global->common.vid, orig_entry->orig_node); } rcu_read_unlock(); if (roamed_back) { batadv_tt_global_free(bat_priv, tt_global, "Roaming canceled"); } else { /* The global entry has to be marked as ROAMING and * has to be kept for consistency purpose */ tt_global->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global->roam_at = jiffies; } } /* store the current remote flags before altering them. 
This helps * understanding is flags are changing or not */ remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; else tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI; /* check the mark in the skb: if it's equal to the configured * isolation_mark, it means the packet is coming from an isolated * non-mesh client */ match_mark = (mark & bat_priv->isolation_mark_mask); if (bat_priv->isolation_mark_mask && match_mark == bat_priv->isolation_mark) tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA; else tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA; /* if any "dynamic" flag has been modified, resend an ADD event for this * entry so that all the nodes can get the new flags */ if (remote_flags ^ (tt_local->common.flags & BATADV_TT_REMOTE_MASK)) batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); ret = true; out: batadv_hardif_put(in_hardif); dev_put(in_dev); batadv_tt_local_entry_put(tt_local); batadv_tt_global_entry_put(tt_global); return ret; } /** * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send * within a TT Response directed to another node * @orig_node: originator for which the TT data has to be prepared * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changed can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. 
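 *
 * Resulting buffer layout (illustration of the code below, not a wire-format
 * guarantee):
 *
 *    +---------------------------+  <- *tt_data
 *    | batadv_tvlv_tt_data       |
 *    +---------------------------+
 *    | batadv_tvlv_tt_vlan_data  |  one per VLAN served by the originator
 *    | ...                       |
 *    +---------------------------+  <- *tt_change
 *    | batadv_tvlv_tt_change ... |  *tt_len bytes reserved for change records
 *    +---------------------------+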
*/ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { u16 num_vlan = 0; u16 num_entries = 0; u16 change_offset; u16 tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); } change_offset = sizeof(**tt_data); change_offset += num_vlan * sizeof(*tt_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(num_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { *tt_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&orig_node->vlan_list_lock); return tvlv_len; } /** * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for * this node * @bat_priv: the bat priv with all the soft interface information * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changes can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire local TT * table. In case of success the value is updated with the real amount of * reserved bytes * * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * * Return: the size of the allocated buffer or 0 in case of failure. 
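 *
 * Passing a negative *tt_len requests space for the whole local table; on
 * return the value holds the number of bytes actually reserved. A caller
 * building a full table response could therefore do (sketch only):
 *
 *    s32 tt_len = -1;
 *
 *    tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data,
 *                                                 &tt_change, &tt_len);
 *    if (!tvlv_len)
 *        return;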
*/ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; u16 num_vlan = 0; u16 vlan_entries = 0; u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; int change_offset; spin_lock_bh(&bat_priv->softif_vlan_list_lock); hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; num_vlan++; total_entries += vlan_entries; } change_offset = sizeof(**tt_data); change_offset += num_vlan * sizeof(*tt_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(total_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { tvlv_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return tvlv_len; } /** * batadv_tt_tvlv_container_update() - update the translation table tvlv * container after local tt changes have been committed * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; struct batadv_tvlv_tt_data *tt_data; struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0; int tt_diff_entries_count = 0; u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); /* if we have too many changes for one packet don't send any * and wait for the tt table request which will be fragmented */ if (tt_diff_len > bat_priv->soft_iface->mtu) tt_diff_len = 0; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data, &tt_change, &tt_diff_len); if (!tvlv_len) return; tt_data->flags = BATADV_TT_OGM_DIFF; if (tt_diff_len == 0) goto container_register; spin_lock_bh(&bat_priv->tt.changes_list_lock); atomic_set(&bat_priv->tt.local_changes, 0); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (tt_diff_entries_count < tt_diff_entries_num) { memcpy(tt_change + tt_diff_entries_count, &entry->change, sizeof(struct batadv_tvlv_tt_change)); tt_diff_entries_count++; } list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } spin_unlock_bh(&bat_priv->tt.changes_list_lock); /* Keep the buffer for possible tt_request */ spin_lock_bh(&bat_priv->tt.last_changeset_lock); kfree(bat_priv->tt.last_changeset); bat_priv->tt.last_changeset_len = 0; bat_priv->tt.last_changeset = NULL; tt_change_len = batadv_tt_len(tt_diff_entries_count); /* check whether this new OGM has no changes due to size problems */ if (tt_diff_entries_count > 0) { /* if kmalloc() fails we will reply with the full table * instead of providing the diff */ bat_priv->tt.last_changeset = kzalloc(tt_diff_len, GFP_ATOMIC); 
if (bat_priv->tt.last_changeset) { memcpy(bat_priv->tt.last_changeset, tt_change, tt_change_len); bat_priv->tt.last_changeset_len = tt_diff_len; } } spin_unlock_bh(&bat_priv->tt.last_changeset_lock); container_register: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data, tvlv_len); kfree(tt_data); } /** * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { void *hdr; struct batadv_softif_vlan *vlan; struct batadv_tt_local_entry *local; unsigned int last_seen_msecs; u32 crc; local = container_of(common, struct batadv_tt_local_entry, common); last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); vlan = batadv_softif_vlan_get(bat_priv, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_softif_vlan_put(vlan); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) goto nla_put_failure; if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @hash: hash to dump * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hashtable *hash, unsigned int bucket, int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; } /** * batadv_tt_local_dump() - Dump TT local entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or 0 on success */ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; int ret; int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; ifindex = batadv_netlink_get_ifindex(cb->nlh, 
BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.local_hash; while (bucket < hash->size) { if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, hash, bucket, &idx)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; return ret; } static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u16 flags, const char *message) { batadv_tt_local_event(bat_priv, tt_local_entry, flags); /* The local client has to be marked as "pending to be removed" but has * to be kept in the table in order to send it in a full table * response issued before the net ttvn increment (consistency check) */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_PENDING; batadv_dbg(BATADV_DBG_TT, bat_priv, "Local tt entry (%pM, vid: %d) pending to be removed: %s\n", tt_local_entry->common.addr, batadv_print_vid(tt_local_entry->common.vid), message); } /** * batadv_tt_local_remove() - logically remove an entry from the local table * @bat_priv: the bat priv with all the soft interface information * @addr: the MAC address of the client to remove * @vid: VLAN identifier * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_local_entry *tt_removed_entry; struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; struct hlist_node *tt_removed_node; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; curr_flags = tt_local_entry->common.flags; flags = BATADV_TT_CLIENT_DEL; /* if this global entry addition is due to a roaming, the node has to * mark the local entry as "roamed" in order to correctly reroute * packets later */ if (roaming) { flags |= BATADV_TT_CLIENT_ROAM; /* mark the local client as ROAMed */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_ROAM; } if (!(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) { batadv_tt_local_set_pending(bat_priv, tt_local_entry, flags, message); goto out; } /* if this client has been added right now, it is possible to * immediately purge it */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local_entry->common); if (!tt_removed_node) goto out; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_local_entry, common.hash_entry); batadv_tt_local_entry_put(tt_removed_entry); out: batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } /** * batadv_tt_local_purge_list() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @head: pointer to the list containing the local tt entries * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge_list(struct batadv_priv 
*bat_priv, struct hlist_head *head, int timeout) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_common_entry *tt_common_entry; struct hlist_node *node_tmp; hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); if (tt_local_entry->common.flags & BATADV_TT_CLIENT_NOPURGE) continue; /* entry already marked for deletion */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) continue; if (!batadv_has_timed_out(tt_local_entry->last_seen, timeout)) continue; batadv_tt_local_set_pending(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL, "timed out"); } } /** * batadv_tt_local_purge() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge(struct batadv_priv *bat_priv, int timeout) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); batadv_tt_local_purge_list(bat_priv, head, timeout); spin_unlock_bh(list_lock); } } static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.local_hash) return; hash = bat_priv->tt.local_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.local_hash = NULL; } static int batadv_tt_global_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.global_hash) return 0; bat_priv->tt.global_hash = batadv_hash_new(1024); if (!bat_priv->tt.global_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.global_hash, &batadv_tt_global_hash_lock_class_key); return 0; } static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } atomic_set(&bat_priv->tt.local_changes, 0); spin_unlock_bh(&bat_priv->tt.changes_list_lock); } /** * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry * @entry: the TT global entry where the orig_list_entry has to be * extracted from * @orig_node: the originator for which the orig_list_entry has to be found * * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node) { struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL; const struct hlist_head *head; 
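/* walk the per-entry originator list under RCU and take a reference
 * (kref_get_unless_zero) on the matching orig_list_entry before returning it;
 * the caller drops it again with batadv_tt_orig_list_entry_put()
 */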
rcu_read_lock(); head = &entry->orig_list; hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; break; } rcu_read_unlock(); return orig_entry; } /** * batadv_tt_global_entry_has_orig() - check if a TT global entry is also * handled by a given originator * @entry: the TT global entry to check * @orig_node: the originator to search in the list * @flags: a pointer to store TT flags for the given @entry received * from @orig_node * * find out if an orig_node is already in the list of a tt_global_entry. * * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node, u8 *flags) { struct batadv_tt_orig_list_entry *orig_entry; bool found = false; orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; if (flags) *flags = orig_entry->flags; batadv_tt_orig_list_entry_put(orig_entry); } return found; } /** * batadv_tt_global_sync_flags() - update TT sync flags * @tt_global: the TT global entry to update sync flags in * * Updates the sync flag bits in the tt_global flag attribute with a logical * OR of all sync flags from any of its TT orig entries. */ static void batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) { struct batadv_tt_orig_list_entry *orig_entry; const struct hlist_head *head; u16 flags = BATADV_NO_FLAGS; rcu_read_lock(); head = &tt_global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) flags |= orig_entry->flags; rcu_read_unlock(); flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); tt_global->common.flags = flags; } /** * batadv_tt_global_orig_entry_add() - add or update a TT orig entry * @tt_global: the TT global entry to add an orig entry in * @orig_node: the originator to add an orig entry for * @ttvn: translation table version number of this changeset * @flags: TT sync flags */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, struct batadv_orig_node *orig_node, int ttvn, u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global->list_lock); orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; orig_entry->flags = flags; goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); if (!orig_entry) goto out; INIT_HLIST_NODE(&orig_entry->list); kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; orig_entry->flags = flags; kref_init(&orig_entry->refcount); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); atomic_inc(&tt_global->orig_list_count); sync_flags: batadv_tt_global_sync_flags(tt_global); out: batadv_tt_orig_list_entry_put(orig_entry); spin_unlock_bh(&tt_global->list_lock); } /** * batadv_tt_global_add() - add a new TT global entry or update an existing one * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator announcing the client * @tt_addr: the mac address of the non-mesh client * @vid: VLAN identifier * @flags: TT flags that have to be set for this non-mesh client * @ttvn: the tt version number ever 
announcing this non-mesh client * * Add a new TT global entry for the given originator. If the entry already * exists add a new reference to the given originator (a global entry can have * references to multiple originators) and adjust the flags attribute to reflect * the function argument. * If a TT local entry exists for this non-mesh client remove it. * * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *tt_addr, unsigned short vid, u16 flags, u8 ttvn) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *tt_local_entry; bool ret = false; int hash_added; struct batadv_tt_common_entry *common; u16 local_flags; /* ignore global entries from backbone nodes */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) return true; tt_global_entry = batadv_tt_global_hash_find(bat_priv, tt_addr, vid); tt_local_entry = batadv_tt_local_hash_find(bat_priv, tt_addr, vid); /* if the node already has a local client for this entry, it has to wait * for a roaming advertisement instead of manually messing up the global * table */ if ((flags & BATADV_TT_CLIENT_TEMP) && tt_local_entry && !(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) goto out; if (!tt_global_entry) { tt_global_entry = kmem_cache_zalloc(batadv_tg_cache, GFP_ATOMIC); if (!tt_global_entry) goto out; common = &tt_global_entry->common; ether_addr_copy(common->addr, tt_addr); common->vid = vid; if (!is_multicast_ether_addr(common->addr)) common->flags = flags & (~BATADV_TT_SYNC_MASK); tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is * needed to purge this entry out on timeout (if nobody claims * it) */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; kref_init(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); kref_get(&common->refcount); hash_added = batadv_hash_add(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, common, &common->hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { common = &tt_global_entry->common; /* If there is already a global entry, we can use this one for * our processing. * But if we are trying to add a temporary client then here are * two options at this point: * 1) the global client is not a temporary client: the global * client has to be left as it is, temporary information * should never override any already known client state * 2) the global client is a temporary client: purge the * originator list and add the new one orig_entry */ if (flags & BATADV_TT_CLIENT_TEMP) { if (!(common->flags & BATADV_TT_CLIENT_TEMP)) goto out; if (batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, NULL)) goto out_remove; batadv_tt_global_del_orig_list(tt_global_entry); goto add_orig_entry; } /* if the client was temporary added before receiving the first * OGM announcing it, we have to clear the TEMP flag. Also, * remove the previous temporary orig node and re-add it * if required. If the orig entry changed, the new one which * is a non-temporary entry is preferred. 
*/ if (common->flags & BATADV_TT_CLIENT_TEMP) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_TEMP; } /* the change can carry possible "attribute" flags like the * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ if (!is_multicast_ether_addr(common->addr)) common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a * delete + roaming change for this originator. * * We should first delete the old originator before adding the * new one. */ if (common->flags & BATADV_TT_CLIENT_ROAM) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = 0; } } add_orig_entry: /* add the new orig_entry (if needed) or update it */ batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", common->addr, batadv_print_vid(common->vid), orig_node->orig); ret = true; out_remove: /* Do not remove multicast addresses from the local hash on * global additions */ if (is_multicast_ether_addr(tt_addr)) goto out; /* remove address from local hash if present */ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid, "global tt received", flags & BATADV_TT_CLIENT_ROAM); tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI; if (!(flags & BATADV_TT_CLIENT_ROAM)) /* this is a normal global add. Therefore the client is not in a * roaming state anymore. */ tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM; out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_transtable_best_orig() - Get best originator list entry from tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. 
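 *
 * No reference is taken on the returned orig_list_entry, so it must only be
 * dereferenced while the caller's RCU read-side section is still open, as
 * batadv_tt_global_dump_entry() does when flagging the best sub-entry.
 * Minimal sketch (use_entry() is a placeholder, not a real helper):
 *
 *    rcu_read_lock();
 *    best = batadv_transtable_best_orig(bat_priv, tt_global);
 *    if (best)
 *        use_entry(best);
 *    rcu_read_unlock();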
*/ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router, *best_router = NULL; struct batadv_algo_ops *bao = bat_priv->algo_ops; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { router = batadv_orig_router_get(orig_entry->orig_node, BATADV_IF_DEFAULT); if (!router) continue; if (best_router && bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } batadv_neigh_node_put(best_router); return best_entry; } /** * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @common: tt local & tt global common data * @orig: Originator node announcing a non-mesh client * @best: Is the best originator for the TT entry * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_common_entry *common, struct batadv_tt_orig_list_entry *orig, bool best) { u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; u32 crc; vlan = batadv_orig_node_vlan_get(orig->orig_node, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_GLOBAL); if (!hdr) return -ENOBUFS; last_ttvn = atomic_read(&orig->orig_node->last_ttvn); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_global_dump_entry() - Dump one TT global entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * @sub_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). 
* * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common, int *sub_s) { struct batadv_tt_orig_list_entry *orig_entry, *best_entry; struct batadv_tt_global_entry *global; struct hlist_head *head; int sub = 0; bool best; global = container_of(common, struct batadv_tt_global_entry, common); best_entry = batadv_transtable_best_orig(bat_priv, global); head = &global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { if (sub++ < *sub_s) continue; best = (orig_entry == best_entry); if (batadv_tt_global_dump_subentry(msg, portid, seq, common, orig_entry, best)) { *sub_s = sub - 1; return -EMSGSIZE; } } *sub_s = 0; return 0; } /** * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @head: Pointer to the list containing the global tt entries * @idx_s: Number of entries to skip * @sub: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_tt_common_entry *common; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(common, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv, common, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_tt_global_dump() - Dump TT global entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or length of message on success */ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; struct hlist_head *head; int ret; int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.global_hash; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_tt_global_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, head, &idx, &sub)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; return ret; } /** * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * * Remove an orig_entry from its list in the given tt_global_entry and * free this orig_entry afterwards. * * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is * part of a list. 
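 *
 * A minimal caller sketch, mirroring batadv_tt_global_del_orig_list() below:
 *
 *    spin_lock_bh(&tt_global_entry->list_lock);
 *    hlist_for_each_entry_safe(orig_entry, safe,
 *                              &tt_global_entry->orig_list, list)
 *        _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
 *    spin_unlock_bh(&tt_global_entry->list_lock);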
 */
static void
_batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
				 struct batadv_tt_orig_list_entry *orig_entry)
{
	lockdep_assert_held(&tt_global_entry->list_lock);

	batadv_tt_global_size_dec(orig_entry->orig_node,
				  tt_global_entry->common.vid);
	atomic_dec(&tt_global_entry->orig_list_count);
	/* requires holding tt_global_entry->list_lock and orig_entry->list
	 * being part of a list
	 */
	hlist_del_rcu(&orig_entry->list);
	batadv_tt_orig_list_entry_put(orig_entry);
}

/* deletes the orig list of a tt_global_entry */
static void
batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
{
	struct hlist_head *head;
	struct hlist_node *safe;
	struct batadv_tt_orig_list_entry *orig_entry;

	spin_lock_bh(&tt_global_entry->list_lock);
	head = &tt_global_entry->orig_list;
	hlist_for_each_entry_safe(orig_entry, safe, head, list)
		_batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
	spin_unlock_bh(&tt_global_entry->list_lock);
}

/**
 * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry
 * @bat_priv: the bat priv with all the soft interface information
 * @tt_global_entry: the global entry to remove the orig_node from
 * @orig_node: the originator announcing the client
 * @message: message to append to the log on deletion
 *
 * Remove the given orig_node and its corresponding orig_entry from the given
 * global tt entry.
 */
static void
batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv,
			       struct batadv_tt_global_entry *tt_global_entry,
			       struct batadv_orig_node *orig_node,
			       const char *message)
{
	struct hlist_head *head;
	struct hlist_node *safe;
	struct batadv_tt_orig_list_entry *orig_entry;
	unsigned short vid;

	spin_lock_bh(&tt_global_entry->list_lock);
	head = &tt_global_entry->orig_list;
	hlist_for_each_entry_safe(orig_entry, safe, head, list) {
		if (orig_entry->orig_node == orig_node) {
			vid = tt_global_entry->common.vid;
			batadv_dbg(BATADV_DBG_TT, bat_priv,
				   "Deleting %pM from global tt entry %pM (vid: %d): %s\n",
				   orig_node->orig,
				   tt_global_entry->common.addr,
				   batadv_print_vid(vid), message);
			_batadv_tt_global_del_orig_entry(tt_global_entry,
							 orig_entry);
		}
	}
	spin_unlock_bh(&tt_global_entry->list_lock);
}

/* If the client is to be deleted, we check if it is the last originator entry
 * within the tt_global entry. If yes, we set the BATADV_TT_CLIENT_ROAM flag
 * and the timer, otherwise we simply remove the originator scheduled for
 * deletion.
 */
static void batadv_tt_global_del_roaming(struct batadv_priv *bat_priv,
					 struct batadv_tt_global_entry *tt_global_entry,
					 struct batadv_orig_node *orig_node,
					 const char *message)
{
	bool last_entry = true;
	struct hlist_head *head;
	struct batadv_tt_orig_list_entry *orig_entry;

	/* no local entry exists, case 1:
	 * Check if this is the last one or if other entries exist.
	 */
	rcu_read_lock();
	head = &tt_global_entry->orig_list;
	hlist_for_each_entry_rcu(orig_entry, head, list) {
		if (orig_entry->orig_node != orig_node) {
			last_entry = false;
			break;
		}
	}
	rcu_read_unlock();

	if (last_entry) {
		/* it's the last one, mark for roaming. */
		tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM;
		tt_global_entry->roam_at = jiffies;
	} else {
		/* there is another entry, we can simply delete this
		 * one and can still use the other one.
*/ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); } } /** * batadv_tt_global_del() - remove a client from the global table * @bat_priv: the bat priv with all the soft interface information * @orig_node: an originator serving this client * @addr: the mac address of the client * @vid: VLAN identifier * @message: a message explaining the reason for deleting the client to print * for debugging purpose * @roaming: true if the deletion has been triggered by a roaming event */ static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *local_entry = NULL; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; if (!roaming) { batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); if (hlist_empty(&tt_global_entry->orig_list)) batadv_tt_global_free(bat_priv, tt_global_entry, message); goto out; } /* if we are deleting a global entry due to a roam * event, there are two possibilities: * 1) the client roamed from node A to node B => if there * is only one originator left for this client, we mark * it with BATADV_TT_CLIENT_ROAM, we start a timer and we * wait for node B to claim it. In case of timeout * the entry is purged. * * If there are other originators left, we directly delete * the originator. * 2) the client roamed to us => we can directly delete * the global entry, since it is useless now. */ local_entry = batadv_tt_local_hash_find(bat_priv, tt_global_entry->common.addr, vid); if (local_entry) { /* local entry exists, case 2: client roamed to us. */ batadv_tt_global_del_orig_list(tt_global_entry); batadv_tt_global_free(bat_priv, tt_global_entry, message); } else { /* no local entry exists, case 1: check for roaming */ batadv_tt_global_del_roaming(bat_priv, tt_global_entry, orig_node, message); } out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(local_entry); } /** * batadv_tt_global_del_orig() - remove all the TT global entries belonging to * the given originator matching the provided vid * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator owning the entries to remove * @match_vid: the VLAN identifier to match. 
If negative all the entries will be * removed * @message: debug message to print as "reason" */ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message) { struct batadv_tt_global_entry *tt_global; struct batadv_tt_common_entry *tt_common_entry; u32 i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_node *safe; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ unsigned short vid; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, safe, head, hash_entry) { /* remove only matching entries */ if (match_vid >= 0 && tt_common_entry->vid != match_vid) continue; tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_del_orig_node(bat_priv, tt_global, orig_node, message); if (hlist_empty(&tt_global->orig_list)) { vid = tt_global->common.vid; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); } clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, char **msg) { bool purge = false; unsigned long roam_timeout = BATADV_TT_CLIENT_ROAM_TIMEOUT; unsigned long temp_timeout = BATADV_TT_CLIENT_TEMP_TIMEOUT; if ((tt_global->common.flags & BATADV_TT_CLIENT_ROAM) && batadv_has_timed_out(tt_global->roam_at, roam_timeout)) { purge = true; *msg = "Roaming timeout\n"; } if ((tt_global->common.flags & BATADV_TT_CLIENT_TEMP) && batadv_has_timed_out(tt_global->common.added_at, temp_timeout)) { purge = true; *msg = "Temporary client timeout\n"; } return purge; } static void batadv_tt_global_purge(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_head *head; struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; char *msg = NULL; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); if (!batadv_tt_global_to_purge(tt_global, &msg)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), msg); hlist_del_rcu(&tt_common->hash_entry); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } } static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.global_hash) return; hash = bat_priv->tt.global_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_global 
= container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.global_hash = NULL; } static bool _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, struct batadv_tt_global_entry *tt_global_entry) { if (tt_local_entry->common.flags & BATADV_TT_CLIENT_WIFI && tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI) return true; /* check if the two clients are marked as isolated */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA && tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA) return true; return false; } /** * batadv_transtable_search() - get the mesh destination for a given client * @bat_priv: the bat priv with all the soft interface information * @src: mac address of the source client * @addr: mac address of the destination client * @vid: VLAN identifier * * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). * * If the two clients are AP isolated the function returns NULL. */ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; struct batadv_tt_global_entry *tt_global_entry = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_tt_orig_list_entry *best_entry; if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) { tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid); if (!tt_local_entry || (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)) goto out; } tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; /* check whether the clients should not communicate due to AP * isolation */ if (tt_local_entry && _batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) goto out; rcu_read_lock(); best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry); /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return orig_node; } /** * batadv_tt_global_crc() - calculates the checksum of the local table belonging * to the given orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator for which the CRC should be computed * @vid: VLAN identifier for which the CRC32 has to be computed * * This function computes the checksum for the global table corresponding to a * specific originator. In particular, the checksum is computed as follows: For * each client connected to the originator the CRC32C of the MAC address and the * VID is computed and then all the CRC32Cs of the various clients are xor'ed * together. * * The idea behind is that CRC32C should be used as much as possible in order to * produce a unique hash of the table, but since the order which is used to feed * the CRC32C function affects the result and since every node in the network * probably sorts the clients differently, the hash function cannot be directly * computed over the entire table. 
Hence the CRC32C is used only on * the single client entry, while all the results are then xor'ed together * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); /* compute the CRC only for entries belonging to the * VLAN identified by the vid passed as parameter */ if (tt_common->vid != vid) continue; /* Roaming clients are in the global table for * consistency only. They don't have to be * taken into account while computing the * global crc */ if (tt_common->flags & BATADV_TT_CLIENT_ROAM) continue; /* Temporary clients have not been announced yet, so * they have to be skipped while computing the global * crc */ if (tt_common->flags & BATADV_TT_CLIENT_TEMP) continue; /* find out if this global entry is announced by this * originator */ tt_orig = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (!tt_orig) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. */ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } return crc; } /** * batadv_tt_local_crc() - calculates the checksum of the local table * @bat_priv: the bat priv with all the soft interface information * @vid: VLAN identifier for which the CRC32 has to be computed * * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). * * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { /* compute the CRC only for entries belonging to the * VLAN identified by vid */ if (tt_common->vid != vid) continue; /* not yet committed clients have not to be taken into * account while computing the CRC */ if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. 
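 *
 * The per-entry recipe, shared with batadv_tt_global_crc(), is roughly the
 * following (names are illustrative; "sync_flags" stands for the subset of
 * flags that must stay in sync between nodes):
 *
 *	tmp_vid = htons(vid);
 *	entry_crc = crc32c(0, &tmp_vid, sizeof(tmp_vid));
 *	entry_crc = crc32c(entry_crc, &sync_flags, sizeof(sync_flags));
 *	entry_crc = crc32c(entry_crc, mac, ETH_ALEN);
 *	crc ^= entry_crc;
 *
 * XOR-ing the per-client values keeps the final checksum independent of the
 * iteration order, which differs from node to node.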
*/ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_common->flags & BATADV_TT_SYNC_MASK; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); } rcu_read_unlock(); } return crc; } /** * batadv_tt_req_node_release() - free tt_req node entry * @ref: kref pointer of the tt req_node entry */ static void batadv_tt_req_node_release(struct kref *ref) { struct batadv_tt_req_node *tt_req_node; tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); kmem_cache_free(batadv_tt_req_cache, tt_req_node); } /** * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and * possibly release it * @tt_req_node: tt_req_node to be free'd */ static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) { if (!tt_req_node) return; kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); } static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_buff_len) { /* Replace the old buffer only if I received something in the * last OGM (the OGM could carry no changes) */ spin_lock_bh(&orig_node->tt_buff_lock); if (tt_buff_len > 0) { kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC); if (orig_node->tt_buff) { memcpy(orig_node->tt_buff, tt_buff, tt_buff_len); orig_node->tt_buff_len = tt_buff_len; } } spin_unlock_bh(&orig_node->tt_buff_lock); } static void batadv_tt_req_purge(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); } /** * batadv_tt_req_node_new() - search and possibly create a tt_req_node object * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. 
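 *
 * The new node is created with two references: one held by tt.req_list and
 * one handed to the caller, which has to be dropped with
 * batadv_tt_req_node_put() once the request has been sent. A rough sketch of
 * the intended use (mirroring batadv_send_tt_request()):
 *
 *	tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node);
 *	if (!tt_req_node)
 *		goto out;	(a request for this originator is pending)
 *	(... build and send the TT_REQUEST tvlv ...)
 *	batadv_tt_req_node_put(tt_req_node);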
 */
static struct batadv_tt_req_node *
batadv_tt_req_node_new(struct batadv_priv *bat_priv,
		       struct batadv_orig_node *orig_node)
{
	struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL;

	spin_lock_bh(&bat_priv->tt.req_list_lock);
	hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) {
		if (batadv_compare_eth(tt_req_node_tmp, orig_node) &&
		    !batadv_has_timed_out(tt_req_node_tmp->issued_at,
					  BATADV_TT_REQUEST_TIMEOUT))
			goto unlock;
	}

	tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC);
	if (!tt_req_node)
		goto unlock;

	kref_init(&tt_req_node->refcount);
	ether_addr_copy(tt_req_node->addr, orig_node->orig);
	tt_req_node->issued_at = jiffies;

	kref_get(&tt_req_node->refcount);
	hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
unlock:
	spin_unlock_bh(&bat_priv->tt.req_list_lock);
	return tt_req_node;
}

/**
 * batadv_tt_local_valid() - verify local tt entry and get flags
 * @entry_ptr: the local tt entry to check
 * @data_ptr: not used but definition required to satisfy the callback prototype
 * @flags: a pointer to store the TT flags for this client in
 *
 * Checks the validity of the given local TT entry. If it is valid, the
 * provided flags pointer is updated.
 *
 * Return: true if the entry is valid, false otherwise.
 */
static bool batadv_tt_local_valid(const void *entry_ptr,
				  const void *data_ptr,
				  u8 *flags)
{
	const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;

	if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW)
		return false;

	if (flags)
		*flags = tt_common_entry->flags;

	return true;
}

/**
 * batadv_tt_global_valid() - verify global tt entry and get flags
 * @entry_ptr: the global tt entry to check
 * @data_ptr: an orig_node object (may be NULL)
 * @flags: a pointer to store the TT flags for this client in
 *
 * Checks the validity of the given global TT entry. If it is valid, the
 * provided flags pointer is updated either with the common (summed) TT flags
 * if data_ptr is NULL or with the specific, per-originator TT flags otherwise.
 *
 * Return: true if the entry is valid, false otherwise.
 */
static bool batadv_tt_global_valid(const void *entry_ptr,
				   const void *data_ptr,
				   u8 *flags)
{
	const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
	const struct batadv_tt_global_entry *tt_global_entry;
	const struct batadv_orig_node *orig_node = data_ptr;

	if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM ||
	    tt_common_entry->flags & BATADV_TT_CLIENT_TEMP)
		return false;

	tt_global_entry = container_of(tt_common_entry,
				       struct batadv_tt_global_entry,
				       common);

	return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node,
					       flags);
}

/**
 * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the
 *  specified tt hash
 * @bat_priv: the bat priv with all the soft interface information
 * @hash: hash table containing the tt entries
 * @tt_len: expected tvlv tt data buffer length in number of bytes
 * @tvlv_buff: pointer to the buffer to fill with the TT data
 * @valid_cb: function to filter tt change entries and to return TT flags
 * @cb_data: data passed to the filter function as argument
 *
 * Fills the tvlv buff with the tt entries from the specified hash. If valid_cb
 * is not provided then this becomes a no-op.
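 *
 * A filter callback gets the tt_common entry, the opaque @cb_data and a
 * pointer where it may report the flags to be advertised. A minimal sketch
 * modelled on batadv_tt_local_valid() ("my_filter" is purely illustrative):
 *
 *	static bool my_filter(const void *entry_ptr, const void *data_ptr,
 *			      u8 *flags)
 *	{
 *		const struct batadv_tt_common_entry *tt = entry_ptr;
 *
 *		if (tt->flags & BATADV_TT_CLIENT_NEW)
 *			return false;	(not committed yet, skip it)
 *		if (flags)
 *			*flags = tt->flags;
 *		return true;
 *	}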
*/ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, bool (*valid_cb)(const void *, const void *, u8 *flags), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; u16 tt_tot, tt_num_entries = 0; u8 flags; bool ret; u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = tvlv_buff; if (!valid_cb) return; rcu_read_lock(); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (tt_tot == tt_num_entries) break; ret = valid_cb(tt_common_entry, cb_data, &flags); if (!ret) continue; ether_addr_copy(tt_change->addr, tt_common_entry->addr); tt_change->flags = flags; tt_change->vid = htons(tt_common_entry->vid); memset(tt_change->reserved, 0, sizeof(tt_change->reserved)); tt_num_entries++; tt_change++; } } rcu_read_unlock(); } /** * batadv_tt_global_check_crc() - check if all the CRCs are correct * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries * * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan) { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; int i, orig_num_vlan; u32 crc; /* check if each received CRC matches the locally stored one */ for (i = 0; i < num_vlan; i++) { tt_vlan_tmp = tt_vlan + i; /* if orig_node is a backbone node for this VLAN, don't check * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(orig_node->bat_priv, orig_node->orig, ntohs(tt_vlan_tmp->vid))) continue; vlan = batadv_orig_node_vlan_get(orig_node, ntohs(tt_vlan_tmp->vid)); if (!vlan) return false; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; } /* check if any excess VLANs exist locally for the originator * which are not mentioned in the TVLV from the originator. 
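 *
 * Every VLAN listed in the TVLV has already been matched above, so a plain
 * count comparison is enough here: if, for example, the TVLV announces
 * VLANs {0, 5} but three per-originator VLAN objects exist locally, the
 * extra one must be stale and the tables are treated as out of sync.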
*/ rcu_read_lock(); orig_num_vlan = 0; hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) orig_num_vlan++; rcu_read_unlock(); if (orig_num_vlan > num_vlan) return false; return true; } /** * batadv_tt_local_update_crc() - update all the local CRCs * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid); } rcu_read_unlock(); } /** * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node for which the CRCs have to be updated */ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node_vlan *vlan; u32 crc; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { /* if orig_node is a backbone node for this VLAN, don't compute * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vlan->vid)) continue; crc = batadv_tt_global_crc(bat_priv, orig_node, vlan->vid); vlan->tt.crc = crc; } rcu_read_unlock(); } /** * batadv_send_tt_request() - send a TT Request message to a given node * @bat_priv: the bat priv with all the soft interface information * @dst_orig_node: the destination of the message * @ttvn: the version number that the source of the message is looking for * @tt_vlan: pointer to the first tvlv VLAN object to request * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise * * Return: true if the TT Request was sent, false otherwise */ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, u8 ttvn, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan_req; struct batadv_hard_iface *primary_if; bool ret = false; int i, size; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node); if (!tt_req_node) goto out; size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan; tvlv_tt_data = kzalloc(size, GFP_ATOMIC); if (!tvlv_tt_data) goto out; tvlv_tt_data->flags = BATADV_TT_REQUEST; tvlv_tt_data->ttvn = ttvn; tvlv_tt_data->num_vlan = htons(num_vlan); /* send all the CRCs within the request. This is needed by intermediate * nodes to ensure they have the correct table before replying */ tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1); for (i = 0; i < num_vlan; i++) { tt_vlan_req->vid = tt_vlan->vid; tt_vlan_req->crc = tt_vlan->crc; tt_vlan_req++; tt_vlan++; } if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_REQUEST to %pM [%c]\n", dst_orig_node->orig, full_table ? 
'F' : '.'); batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, dst_orig_node->orig, BATADV_TVLV_TT, 1, tvlv_tt_data, size); ret = true; out: batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); if (!hlist_unhashed(&tt_req_node->list)) { hlist_del_init(&tt_req_node->list); batadv_tt_req_node_put(tt_req_node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } batadv_tt_req_node_put(tt_req_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_other_tt_response() - send reply to tt request concerning another * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; u8 orig_ttvn, req_ttvn; u16 tvlv_len; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); if (!req_dst_orig_node) goto out; res_dst_orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!res_dst_orig_node) goto out; orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); /* this node doesn't have the requested data */ if (orig_ttvn != req_ttvn || !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan, ntohs(tt_data->num_vlan))) goto out; /* If the full table has been explicitly requested */ if (tt_data->flags & BATADV_TT_FULL_TABLE || !req_dst_orig_node->tt_buff) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&req_dst_orig_node->tt_buff_lock); tt_len = req_dst_orig_node->tt_buff_len; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, req_dst_orig_node->tt_buff, req_dst_orig_node->tt_buff_len); spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); } else { /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto out; /* fill the rest of the tvlv with the real TT entries */ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash, tt_change, tt_len, batadv_tt_global_valid, req_dst_orig_node); } /* Don't send the response, if larger than fragmented packet. 
*/ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len; if (tt_len > atomic_read(&bat_priv->packet_size_max)) { net_ratelimited_function(batadv_info, bat_priv->soft_iface, "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n", res_dst_orig_node->orig); goto out; } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE %pM for %pM [%c] (ttvn: %u)\n", res_dst_orig_node->orig, req_dst_orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, req_dst_orig_node->orig, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); ret = true; goto out; unlock: spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); out: batadv_orig_node_put(res_dst_orig_node); batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_my_tt_response() - send reply to tt request concerning this * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_hard_iface *primary_if = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_orig_node *orig_node; u8 my_ttvn, req_ttvn; u16 tvlv_len; bool full_table; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 
'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); my_ttvn = (u8)atomic_read(&bat_priv->tt.vn); req_ttvn = tt_data->ttvn; orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!orig_node) goto out; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* If the full table has been explicitly requested or the gap * is too big send the whole local translation table */ if (tt_data->flags & BATADV_TT_FULL_TABLE || my_ttvn != req_ttvn || !bat_priv->tt.last_changeset) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&bat_priv->tt.last_changeset_lock); tt_len = bat_priv->tt.last_changeset_len; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, bat_priv->tt.last_changeset, bat_priv->tt.last_changeset_len); spin_unlock_bh(&bat_priv->tt.last_changeset_lock); } else { req_ttvn = (u8)atomic_read(&bat_priv->tt.vn); /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto out; /* fill the rest of the tvlv with the real TT entries */ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash, tt_change, tt_len, batadv_tt_local_valid, NULL); } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE to %pM [%c] (ttvn: %u)\n", orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); goto out; unlock: spin_unlock_bh(&bat_priv->tt.last_changeset_lock); out: spin_unlock_bh(&bat_priv->tt.commit_lock); batadv_orig_node_put(orig_node); batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; } /** * batadv_send_tt_response() - send reply to tt request * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. 
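 *
 * Request and response share the same wire layout: a struct
 * batadv_tvlv_tt_data header, followed by num_vlan per-VLAN descriptors,
 * followed by the TT change entries. The change entries therefore start at
 * (as computed in batadv_handle_tt_response()):
 *
 *	offset = sizeof(struct batadv_tvlv_tt_data) +
 *		 ntohs(tt_data->num_vlan) *
 *		 sizeof(struct batadv_tvlv_tt_vlan_data);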
*/ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { if (batadv_is_my_mac(bat_priv, req_dst)) return batadv_send_my_tt_response(bat_priv, tt_data, req_src); return batadv_send_other_tt_response(bat_priv, tt_data, req_src, req_dst); } static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { int i; int roams; for (i = 0; i < tt_num_changes; i++) { if ((tt_change + i)->flags & BATADV_TT_CLIENT_DEL) { roams = (tt_change + i)->flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_del(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), "tt removed by changes", roams); } else { if (!batadv_tt_global_add(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), (tt_change + i)->flags, ttvn)) /* In case of problem while storing a * global_entry, we stop the updating * procedure without committing the * ttvn change. This will avoid to send * corrupted data on tt_request */ return; } } set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change, u8 ttvn, u8 *resp_src, u16 num_entries) { struct batadv_orig_node *orig_node; orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; /* Purge the old table first.. */ batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Received full table"); _batadv_tt_update_changes(bat_priv, orig_node, tt_change, num_entries, ttvn); spin_lock_bh(&orig_node->tt_buff_lock); kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = NULL; spin_unlock_bh(&orig_node->tt_buff_lock); atomic_set(&orig_node->last_ttvn, ttvn); out: batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, u16 tt_num_changes, u8 ttvn, struct batadv_tvlv_tt_change *tt_change) { _batadv_tt_update_changes(bat_priv, orig_node, tt_change, tt_num_changes, ttvn); batadv_tt_save_orig_buffer(bat_priv, orig_node, tt_change, batadv_tt_len(tt_num_changes)); atomic_set(&orig_node->last_ttvn, ttvn); } /** * batadv_is_my_client() - check if a client is served by the local node * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if the client is served by this node, false otherwise. 
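 *
 * Clients marked as BATADV_TT_CLIENT_PENDING or BATADV_TT_CLIENT_ROAM are
 * reported as not served even though they are still present in the table.
 * A hypothetical caller (not taken from this file) would look roughly like:
 *
 *	if (batadv_is_my_client(bat_priv, ethhdr->h_dest, vid))
 *		(deliver the frame locally instead of sending it
 *		 through the mesh)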
*/ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; /* Check if the client has been logically deleted (but is kept for * consistency purpose) */ if ((tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) || (tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM)) goto out; ret = true; out: batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_handle_tt_response() - process incoming tt reply * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @resp_src: mac address of tt reply sender * @num_entries: number of tt change entries appended to the tt data */ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *resp_src, u16 num_entries) { struct batadv_tt_req_node *node; struct hlist_node *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; spin_lock_bh(&orig_node->tt_lock); change_offset = sizeof(struct batadv_tvlv_tt_vlan_data); change_offset *= ntohs(tt_data->num_vlan); change_offset += sizeof(*tt_data); tvlv_ptr += change_offset; tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr; if (tt_data->flags & BATADV_TT_FULL_TABLE) { batadv_tt_fill_gtable(bat_priv, tt_change, tt_data->ttvn, resp_src, num_entries); } else { batadv_tt_update_changes(bat_priv, orig_node, num_entries, tt_data->ttvn, tt_change); } /* Recalculate the CRC for this orig_node and store it */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* Delete the tt_req_node from pending tt_requests list */ spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); out: batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { if (!batadv_has_timed_out(node->first_time, BATADV_ROAMING_MAX_TIME)) continue; list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } /** * batadv_tt_check_roam_count() - check if a client has roamed too frequently * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * * This function checks whether the client already reached the * maximum number of possible roaming phases. 
In this case the ROAMING_ADV * will not be sent. * * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { struct batadv_tt_roam_node *tt_roam_node; bool ret = false; spin_lock_bh(&bat_priv->tt.roam_list_lock); /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ list_for_each_entry(tt_roam_node, &bat_priv->tt.roam_list, list) { if (!batadv_compare_eth(tt_roam_node->addr, client)) continue; if (batadv_has_timed_out(tt_roam_node->first_time, BATADV_ROAMING_MAX_TIME)) continue; if (!batadv_atomic_dec_not_zero(&tt_roam_node->counter)) /* Sorry, you roamed too many times! */ goto unlock; ret = true; break; } if (!ret) { tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache, GFP_ATOMIC); if (!tt_roam_node) goto unlock; tt_roam_node->first_time = jiffies; atomic_set(&tt_roam_node->counter, BATADV_ROAMING_MAX_COUNT - 1); ether_addr_copy(tt_roam_node->addr, client); list_add(&tt_roam_node->list, &bat_priv->tt.roam_list); ret = true; } unlock: spin_unlock_bh(&bat_priv->tt.roam_list_lock); return ret; } /** * batadv_send_roam_adv() - send a roaming advertisement message * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * @vid: VLAN identifier * @orig_node: message destination * * Send a ROAMING_ADV message to the node which was previously serving this * client. This is done to inform the node that from now on all traffic destined * for this particular roamed client has to be forwarded to the sender of the * roaming message. */ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node) { struct batadv_hard_iface *primary_if; struct batadv_tvlv_roam_adv tvlv_roam; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* before going on we have to check whether the client has * already roamed to us too many times */ if (!batadv_tt_check_roam_count(bat_priv, client)) goto out; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n", orig_node->orig, client, batadv_print_vid(vid)); batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX); memcpy(tvlv_roam.client, client, sizeof(tvlv_roam.client)); tvlv_roam.vid = htons(vid); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, orig_node->orig, BATADV_TVLV_ROAM, 1, &tvlv_roam, sizeof(tvlv_roam)); out: batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_tt *priv_tt; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_tt = container_of(delayed_work, struct batadv_priv_tt, work); bat_priv = container_of(priv_tt, struct batadv_priv, tt); batadv_tt_local_purge(bat_priv, BATADV_TT_LOCAL_TIMEOUT); batadv_tt_global_purge(bat_priv); batadv_tt_req_purge(bat_priv); batadv_tt_roam_purge(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); } /** * batadv_tt_free() - Free translation table of soft interface * @bat_priv: the bat priv with all the soft interface information */ void batadv_tt_free(struct batadv_priv *bat_priv) { batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1); cancel_delayed_work_sync(&bat_priv->tt.work); 
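	/* the TVLV handlers and the TT container are unregistered and the
	 * periodic purge work has been cancelled above, so the tables and
	 * pending lists below can be torn down
	 */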
batadv_tt_local_table_free(bat_priv); batadv_tt_global_table_free(bat_priv); batadv_tt_req_list_free(bat_priv); batadv_tt_changes_list_free(bat_priv); batadv_tt_roam_list_free(bat_priv); kfree(bat_priv->tt.last_changeset); } /** * batadv_tt_local_set_flags() - set or unset the specified flags on the local * table and possibly count them in the TT size * @bat_priv: the bat priv with all the soft interface information * @flags: the flag to switch * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries */ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, bool enable, bool count) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; struct hlist_head *head; u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (enable) { if ((tt_common_entry->flags & flags) == flags) continue; tt_common_entry->flags |= flags; } else { if (!(tt_common_entry->flags & flags)) continue; tt_common_entry->flags &= ~flags; } if (!count) continue; batadv_tt_local_size_inc(bat_priv, tt_common_entry->vid); } rcu_read_unlock(); } } /* Purge out all the tt local entries marked with BATADV_TT_CLIENT_PENDING */ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting local tt entry (%pM, vid: %d): pending\n", tt_common->addr, batadv_print_vid(tt_common->vid)); batadv_tt_local_size_dec(bat_priv, tt_common->vid); hlist_del_rcu(&tt_common->hash_entry); tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } } /** * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes * which have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information * * Caller must hold tt->commit_lock. 
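 *
 * A rough sketch of how this variant is used when several local changes have
 * to be applied under the same lock as the commit (as done in
 * batadv_tt_local_resize_to_mtu()):
 *
 *	spin_lock_bh(&bat_priv->tt.commit_lock);
 *	(... purge or otherwise modify local entries ...)
 *	batadv_tt_local_commit_changes_nolock(bat_priv);
 *	spin_unlock_bh(&bat_priv->tt.commit_lock);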
*/ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { lockdep_assert_held(&bat_priv->tt.commit_lock); if (atomic_read(&bat_priv->tt.local_changes) < 1) { if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) batadv_tt_tvlv_container_update(bat_priv); return; } batadv_tt_local_set_flags(bat_priv, BATADV_TT_CLIENT_NEW, false, true); batadv_tt_local_purge_pending_clients(bat_priv); batadv_tt_local_update_crc(bat_priv); /* Increment the TTVN only once per OGM interval */ atomic_inc(&bat_priv->tt.vn); batadv_dbg(BATADV_DBG_TT, bat_priv, "Local changes committed, updating to ttvn %u\n", (u8)atomic_read(&bat_priv->tt.vn)); /* reset the sending counter */ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX); batadv_tt_tvlv_container_update(bat_priv); } /** * batadv_tt_local_commit_changes() - commit all pending local tt changes which * have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information */ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) { spin_lock_bh(&bat_priv->tt.commit_lock); batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped * @bat_priv: the bat priv with all the soft interface information * @src: source mac address of packet * @dst: destination mac address of packet * @vid: vlan id of packet * * Return: true when src+dst(+vid) pair should be isolated, false otherwise */ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_global_entry *tt_global_entry; struct batadv_softif_vlan *vlan; bool ret = false; vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return false; if (!atomic_read(&vlan->ap_isolation)) goto vlan_put; tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid); if (!tt_local_entry) goto vlan_put; tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid); if (!tt_global_entry) goto local_entry_put; if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) ret = true; batadv_tt_global_entry_put(tt_global_entry); local_entry_put: batadv_tt_local_entry_put(tt_local_entry); vlan_put: batadv_softif_vlan_put(vlan); return ret; } /** * batadv_tt_update_orig() - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node of the ogm * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_num_vlan, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes */ if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) { /* the OGM could not contain the 
changes due to their size or * because they have already been sent BATADV_TT_OGM_APPEND_MAX * times. * In this case send a tt request */ if (!tt_num_changes) { full_table = false; goto request_table; } spin_lock_bh(&orig_node->tt_lock); batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); /* Even if we received the precomputed crc with the OGM, we * prefer to recompute it to spot any possible inconsistency * in the global table */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* The ttvn alone is not enough to guarantee consistency * because a single value could represent different states * (due to the wrap around). Thus a node has to check whether * the resulting table (after applying the changes) is still * consistent or not. E.g. a node could disconnect while its * ttvn is X and reconnect on ttvn = X + TTVN_MAX: in this case * checking the CRC value is mandatory to detect the * inconsistency */ if (!batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) goto request_table; } else { /* if we missed more than one change or our tables are not * in sync anymore -> request fresh tt data */ if (!has_tt_init || ttvn != orig_ttvn || !batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) { request_table: batadv_dbg(BATADV_DBG_TT, bat_priv, "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u num_changes: %u)\n", orig_node->orig, ttvn, orig_ttvn, tt_num_changes); batadv_send_tt_request(bat_priv, orig_node, ttvn, tt_vlan, tt_num_vlan, full_table); return; } } } /** * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; bool ret = false; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_entry_put(tt_global_entry); out: return ret; } /** * batadv_tt_local_client_is_roaming() - tells whether the client is roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the local client to query * @vid: VLAN identifier * * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. 
If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_local_entry_put(tt_local_entry); out: return ret; } /** * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which the temporary entry should be associated with * @addr: mac address of the client * @vid: VLAN id of the new temporary global translation table * * Return: true when temporary tt entry could be added, false otherwise */ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid) { /* ignore loop detect macs, they are not supposed to be in the tt local * data as well. */ if (batadv_bla_is_loopdetect_mac(addr)) return false; if (!batadv_tt_global_add(bat_priv, orig_node, addr, vid, BATADV_TT_CLIENT_TEMP, atomic_read(&orig_node->last_ttvn))) return false; batadv_dbg(BATADV_DBG_TT, bat_priv, "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n", addr, batadv_print_vid(vid), orig_node->orig); return true; } /** * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the * maximum packet size that can be transported through the mesh * @soft_iface: netdev struct of the mesh interface * * Remove entries older than 'timeout' and half timeout if more entries need * to be removed. */ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int packet_size_max = atomic_read(&bat_priv->packet_size_max); int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2; bool reduced = false; spin_lock_bh(&bat_priv->tt.commit_lock); while (true) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; batadv_tt_local_purge(bat_priv, timeout); batadv_tt_local_purge_pending_clients(bat_priv); timeout /= 2; reduced = true; net_ratelimited_function(batadv_info, soft_iface, "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n", packet_size_max); } /* commit these changes immediately, to avoid synchronization problem * with the TTVN */ if (reduced) batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); num_vlan = ntohs(tt_data->num_vlan); if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan) return; tt_vlan = (struct batadv_tvlv_tt_vlan_data 
*)(tt_data + 1); tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); tvlv_value_len -= sizeof(*tt_vlan) * num_vlan; num_entries = batadv_tt_entries(tvlv_value_len); batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change, num_entries, tt_data->ttvn); } /** * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_data *tt_data; u16 tt_vlan_len, tt_num_entries; char tt_flag; bool ret; if (tvlv_value_len < sizeof(*tt_data)) return NET_RX_SUCCESS; tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data); tt_vlan_len *= ntohs(tt_data->num_vlan); if (tvlv_value_len < tt_vlan_len) return NET_RX_SUCCESS; tvlv_value_len -= tt_vlan_len; tt_num_entries = batadv_tt_entries(tvlv_value_len); switch (tt_data->flags & BATADV_TT_DATA_TYPE_MASK) { case BATADV_TT_REQUEST: batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_RX); /* If this node cannot provide a TT response the tt_request is * forwarded */ ret = batadv_send_tt_response(bat_priv, tt_data, src, dst); if (!ret) { if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_REQUEST to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } break; case BATADV_TT_RESPONSE: batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX); if (batadv_is_my_mac(bat_priv, dst)) { batadv_handle_tt_response(bat_priv, tt_data, src, tt_num_entries); return NET_RX_SUCCESS; } if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_RESPONSE to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } return NET_RX_SUCCESS; } /** * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_roam_adv *roaming_adv; struct batadv_orig_node *orig_node = NULL; /* If this node is not the intended recipient of the * roaming advertisement the packet is forwarded * (the tvlv API will re-route the packet). 
*/ if (!batadv_is_my_mac(bat_priv, dst)) return NET_RX_DROP; if (tvlv_value_len < sizeof(*roaming_adv)) goto out; orig_node = batadv_orig_hash_find(bat_priv, src); if (!orig_node) goto out; batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX); roaming_adv = tvlv_value; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received ROAMING_ADV from %pM (client %pM)\n", src, roaming_adv->client); batadv_tt_global_add(bat_priv, orig_node, roaming_adv->client, ntohs(roaming_adv->vid), BATADV_TT_CLIENT_ROAM, atomic_read(&orig_node->last_ttvn) + 1); out: batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } /** * batadv_tt_init() - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { int ret; /* synchronized flags must be remote */ BUILD_BUG_ON(!(BATADV_TT_SYNC_MASK & BATADV_TT_REMOTE_MASK)); ret = batadv_tt_local_init(bat_priv); if (ret < 0) return ret; ret = batadv_tt_global_init(bat_priv); if (ret < 0) { batadv_tt_local_table_free(bat_priv); return ret; } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_TT, 1, BATADV_NO_FLAGS); batadv_tvlv_handler_register(bat_priv, NULL, batadv_roam_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS); INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); return 1; } /** * batadv_tt_global_is_isolated() - check if a client is marked as isolated * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt; bool ret; tt = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt) return false; ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; batadv_tt_global_entry_put(tt); return ret; } /** * batadv_tt_cache_init() - Initialize tt memory object cache * * Return: 0 on success or negative error number in case of failure. 
*/ int __init batadv_tt_cache_init(void) { size_t tl_size = sizeof(struct batadv_tt_local_entry); size_t tg_size = sizeof(struct batadv_tt_global_entry); size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry); size_t tt_change_size = sizeof(struct batadv_tt_change_node); size_t tt_req_size = sizeof(struct batadv_tt_req_node); size_t tt_roam_size = sizeof(struct batadv_tt_roam_node); batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tl_cache) return -ENOMEM; batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tg_cache) goto err_tt_tl_destroy; batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache", tt_orig_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_orig_cache) goto err_tt_tg_destroy; batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache", tt_change_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_change_cache) goto err_tt_orig_destroy; batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache", tt_req_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_req_cache) goto err_tt_change_destroy; batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache", tt_roam_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_roam_cache) goto err_tt_req_destroy; return 0; err_tt_req_destroy: kmem_cache_destroy(batadv_tt_req_cache); batadv_tt_req_cache = NULL; err_tt_change_destroy: kmem_cache_destroy(batadv_tt_change_cache); batadv_tt_change_cache = NULL; err_tt_orig_destroy: kmem_cache_destroy(batadv_tt_orig_cache); batadv_tt_orig_cache = NULL; err_tt_tg_destroy: kmem_cache_destroy(batadv_tg_cache); batadv_tg_cache = NULL; err_tt_tl_destroy: kmem_cache_destroy(batadv_tl_cache); batadv_tl_cache = NULL; return -ENOMEM; } /** * batadv_tt_cache_destroy() - Destroy tt memory object cache */ void batadv_tt_cache_destroy(void) { kmem_cache_destroy(batadv_tl_cache); kmem_cache_destroy(batadv_tg_cache); kmem_cache_destroy(batadv_tt_orig_cache); kmem_cache_destroy(batadv_tt_change_cache); kmem_cache_destroy(batadv_tt_req_cache); kmem_cache_destroy(batadv_tt_roam_cache); } |
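/*
 * The cache setup above uses the common "unwind in reverse order" goto
 * pattern: each allocation that fails jumps to a label that tears down only
 * what was already created. A minimal, self-contained sketch of the same
 * idiom with two hypothetical caches (the ex_* names are illustrative and
 * not part of batman-adv):
 */
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/errno.h>

static struct kmem_cache *ex_cache_a;
static struct kmem_cache *ex_cache_b;

static int __init ex_caches_init(void)
{
	ex_cache_a = kmem_cache_create("ex_cache_a", 64, 0,
				       SLAB_HWCACHE_ALIGN, NULL);
	if (!ex_cache_a)
		return -ENOMEM;

	ex_cache_b = kmem_cache_create("ex_cache_b", 128, 0,
				       SLAB_HWCACHE_ALIGN, NULL);
	if (!ex_cache_b)
		goto err_a_destroy;

	return 0;

err_a_destroy:
	/* tear down in the reverse order of creation */
	kmem_cache_destroy(ex_cache_a);
	ex_cache_a = NULL;
	return -ENOMEM;
}

static void ex_caches_destroy(void)
{
	kmem_cache_destroy(ex_cache_b);
	kmem_cache_destroy(ex_cache_a);
}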
1149 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values for SHA-2 algorithms */ #ifndef _CRYPTO_SHA2_H #define _CRYPTO_SHA2_H #include <linux/types.h> #define SHA224_DIGEST_SIZE 28 #define SHA224_BLOCK_SIZE 64 #define SHA256_DIGEST_SIZE 32 #define SHA256_BLOCK_SIZE 64 #define SHA384_DIGEST_SIZE 48 #define SHA384_BLOCK_SIZE 128 #define SHA512_DIGEST_SIZE 64 #define SHA512_BLOCK_SIZE 128 #define SHA224_H0 0xc1059ed8UL #define SHA224_H1 0x367cd507UL #define SHA224_H2 0x3070dd17UL #define SHA224_H3 0xf70e5939UL #define SHA224_H4 0xffc00b31UL #define SHA224_H5 0x68581511UL #define SHA224_H6 0x64f98fa7UL #define SHA224_H7 0xbefa4fa4UL #define SHA256_H0 0x6a09e667UL #define SHA256_H1 0xbb67ae85UL #define SHA256_H2 0x3c6ef372UL #define SHA256_H3 0xa54ff53aUL #define SHA256_H4 0x510e527fUL #define SHA256_H5 0x9b05688cUL #define SHA256_H6 0x1f83d9abUL #define SHA256_H7 0x5be0cd19UL #define SHA384_H0 0xcbbb9d5dc1059ed8ULL #define SHA384_H1 0x629a292a367cd507ULL #define SHA384_H2 0x9159015a3070dd17ULL #define SHA384_H3 0x152fecd8f70e5939ULL #define SHA384_H4 0x67332667ffc00b31ULL #define SHA384_H5 0x8eb44a8768581511ULL #define SHA384_H6 0xdb0c2e0d64f98fa7ULL #define SHA384_H7 0x47b5481dbefa4fa4ULL #define SHA512_H0 0x6a09e667f3bcc908ULL #define SHA512_H1 0xbb67ae8584caa73bULL #define SHA512_H2 0x3c6ef372fe94f82bULL #define SHA512_H3 0xa54ff53a5f1d36f1ULL #define SHA512_H4 0x510e527fade682d1ULL #define SHA512_H5 0x9b05688c2b3e6c1fULL #define SHA512_H6 0x1f83d9abfb41bd6bULL #define SHA512_H7 0x5be0cd19137e2179ULL extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE]; extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE]; struct sha256_state { u32 state[SHA256_DIGEST_SIZE / 4]; u64 count; u8 buf[SHA256_BLOCK_SIZE]; }; struct sha512_state { u64 state[SHA512_DIGEST_SIZE / 8]; u64 count[2]; u8 buf[SHA512_BLOCK_SIZE]; }; struct shash_desc; extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len); extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash); extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len); extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash); /* * Stand-alone implementation of the SHA256 algorithm. It is designed to * have as little dependencies as possible so it can be used in the * kexec_file purgatory. In other cases you should generally use the * hash APIs from include/crypto/hash.h. Especially when hashing large * amounts of data as those APIs may be hw-accelerated. 
* * For details see lib/crypto/sha256.c */ static inline void sha256_init(struct sha256_state *sctx) { sctx->state[0] = SHA256_H0; sctx->state[1] = SHA256_H1; sctx->state[2] = SHA256_H2; sctx->state[3] = SHA256_H3; sctx->state[4] = SHA256_H4; sctx->state[5] = SHA256_H5; sctx->state[6] = SHA256_H6; sctx->state[7] = SHA256_H7; sctx->count = 0; } void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len); void sha256_final(struct sha256_state *sctx, u8 *out); void sha256(const u8 *data, unsigned int len, u8 *out); static inline void sha224_init(struct sha256_state *sctx) { sctx->state[0] = SHA224_H0; sctx->state[1] = SHA224_H1; sctx->state[2] = SHA224_H2; sctx->state[3] = SHA224_H3; sctx->state[4] = SHA224_H4; sctx->state[5] = SHA224_H5; sctx->state[6] = SHA224_H6; sctx->state[7] = SHA224_H7; sctx->count = 0; } /* Simply use sha256_update as it is equivalent to sha224_update. */ void sha224_final(struct sha256_state *sctx, u8 *out); #endif /* _CRYPTO_SHA2_H */ |
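/*
 * Usage sketch for the helpers declared above (the ex_* wrapper is
 * illustrative, not part of this header): hash two scattered buffers with
 * the incremental API, or use the one-shot sha256() helper for a single
 * contiguous buffer.
 */
#include <crypto/sha2.h>

static void ex_sha256_two_buffers(const u8 *a, unsigned int a_len,
				  const u8 *b, unsigned int b_len,
				  u8 digest[SHA256_DIGEST_SIZE])
{
	struct sha256_state sctx;

	sha256_init(&sctx);
	sha256_update(&sctx, a, a_len);
	sha256_update(&sctx, b, b_len);
	sha256_final(&sctx, digest);
}

/*
 * Equivalent one-shot form for a single buffer:
 *
 *	sha256(data, len, digest);
 */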
448 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMER_H #define _LINUX_TIMER_H #include <linux/list.h> #include <linux/ktime.h> #include <linux/stddef.h> #include <linux/debugobjects.h> #include <linux/stringify.h> struct timer_list { /* * All fields that change during normal runtime grouped to the * same cacheline */ struct hlist_node entry; unsigned long expires; void (*function)(struct timer_list *); u32 flags; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key * (second argument) here is required, otherwise it could be initialised to * the copy of the lockdep_map later! We use the pointer to and the string * "<file>:<line>" as the key resp. the name of the lockdep_map. */ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), #else #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif /** * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU * eventually wakes up with a subsequent non-deferrable timer. * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from * IRQ handlers, for example, by calling del_timer_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! * * @TIMER_PINNED: A pinned timer will not be affected by any timer * placement heuristics (like, NOHZ) and will always expire on the CPU * on which the timer was enqueued. * * Note: Because enqueuing of timers can migrate the timer from one * CPU to another, pinned timers are not guaranteed to stay on the * initialy selected CPU. They move to the CPU on which the enqueue * function is invoked via mod_timer() or add_timer(). If the timer * should be placed on a particular CPU, then add_timer_on() has to be * used. 
*/ #define TIMER_CPUMASK 0x0003FFFF #define TIMER_MIGRATING 0x00040000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 #define TIMER_INIT_FLAGS (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define TIMER_ARRAYSHIFT 22 #define TIMER_ARRAYMASK 0xFFC00000 #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define __TIMER_INITIALIZER(_function, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ __TIMER_INITIALIZER(_function, 0) /* * LOCKDEP and DEBUG timer interfaces. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #else static inline void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { init_timer_key(timer, func, flags, name, key); } #endif #ifdef CONFIG_LOCKDEP #define __init_timer(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) #define __init_timer_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_on_stack_key((_timer), (_fn), (_flags), \ #_timer, &__key); \ } while (0) #else #define __init_timer(_timer, _fn, _flags) \ init_timer_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif /** * timer_setup - prepare a timer for first use * @timer: the timer in question * @callback: the function to call when timer expires * @flags: any TIMER_* flags * * Regular timer initialization should use either DEFINE_TIMER() above, * or timer_setup(). For timers on the stack, timer_setup_on_stack() must * be used and must be balanced with a call to destroy_timer_on_stack(). */ #define timer_setup(timer, callback, flags) \ __init_timer((timer), (callback), (flags)) #define timer_setup_on_stack(timer, callback, flags) \ __init_timer_on_stack((timer), (callback), (flags)) #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } #endif #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) /** * timer_pending - is a timer pending? * @timer: the timer in question * * timer_pending will tell whether a given timer is currently pending, * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * * return value: 1 if the timer is pending, 0 if not. 
*/ static inline int timer_pending(const struct timer_list * timer) { return !hlist_unhashed_lockless(&timer->entry); } extern void add_timer_on(struct timer_list *timer, int cpu); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); extern int timer_reduce(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); /** * del_timer_sync - Delete a pending timer and wait for a running callback * @timer: The timer to be deleted * * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. */ static inline int del_timer_sync(struct timer_list *timer) { return timer_delete_sync(timer); } /** * del_timer - Delete a pending timer * @timer: The timer to be deleted * * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. */ static inline int del_timer(struct timer_list *timer) { return timer_delete(timer); } extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); unsigned long round_jiffies_relative(unsigned long j); unsigned long __round_jiffies_up(unsigned long j, int cpu); unsigned long __round_jiffies_up_relative(unsigned long j, int cpu); unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else #define timers_prepare_cpu NULL #define timers_dead_cpu NULL #endif #endif |
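/*
 * Usage sketch for the timer API above (driver-style; struct ex_device and
 * the ex_* functions are made up for the example): timer_setup() wires up
 * the callback, from_timer() recovers the containing object inside it, and
 * timer_shutdown_sync() waits for a running callback and keeps the timer
 * from being re-armed afterwards.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

struct ex_device {
	struct timer_list poll_timer;
	unsigned long polls;
};

static void ex_poll_timeout(struct timer_list *t)
{
	struct ex_device *dev = from_timer(dev, t, poll_timer);

	dev->polls++;
	/* re-arm one second from now */
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void ex_device_start(struct ex_device *dev)
{
	timer_setup(&dev->poll_timer, ex_poll_timeout, 0);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void ex_device_stop(struct ex_device *dev)
{
	/* waits for a running ex_poll_timeout() and disarms for good */
	timer_shutdown_sync(&dev->poll_timer);
}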
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_LABELS_H
#define _NF_CONNTRACK_LABELS_H

#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <uapi/linux/netfilter/xt_connlabel.h>

#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)

struct nf_conn_labels {
	unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
};

/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols
 * exported by nf_conntrack module.
 */
static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
	struct nf_ct_ext *ext = ct->ext;

	if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
		return NULL;

	return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS];
#else
	return NULL;
#endif
}

static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
	struct net *net = nf_ct_net(ct);

	if (net->ct.labels_used == 0)
		return NULL;

	return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC);
#else
	return NULL;
#endif
}

int nf_connlabels_replace(struct nf_conn *ct,
			  const u32 *data, const u32 *mask, unsigned int words);

#ifdef CONFIG_NF_CONNTRACK_LABELS
int nf_connlabels_get(struct net *net, unsigned int bit);
void nf_connlabels_put(struct net *net);
#else
static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; }
static inline void nf_connlabels_put(struct net *net) {}
#endif
#endif /* _NF_CONNTRACK_LABELS_H */
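/*
 * Hedged usage sketch for the helpers above (the ex_* function is
 * illustrative, not part of this header): look up the label extension of a
 * conntrack entry and test a single bit. @bit must not exceed
 * XT_CONNLABEL_MAXBIT.
 */
#include <linux/bitops.h>
#include <net/netfilter/nf_conntrack_labels.h>

static bool ex_ct_has_label(const struct nf_conn *ct, unsigned int bit)
{
	struct nf_conn_labels *labels = nf_ct_labels_find(ct);

	if (!labels)
		return false;

	return test_bit(bit, labels->bits);
}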
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bpf_test_run

#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BPF_TEST_RUN_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(bpf_test_finish,

	TP_PROTO(int *err),

	TP_ARGS(err),

	TP_STRUCT__entry(
		__field(int, err)
	),

	TP_fast_assign(
		__entry->err = *err;
	),

	TP_printk("bpf_test_finish with err=%d", __entry->err)
);

#ifdef DEFINE_EVENT_WRITABLE
#undef BPF_TEST_RUN_DEFINE_EVENT
#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size)	\
	DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto),		\
			      PARAMS(args), size)
#else
#undef BPF_TEST_RUN_DEFINE_EVENT
#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size)	\
	DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args))
#endif

BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish,

	TP_PROTO(int *err),

	TP_ARGS(err),

	sizeof(int)
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>
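/*
 * Hedged usage sketch (standard tracepoint conventions, not something this
 * header spells out, and the ex_* function is illustrative): exactly one .c
 * file defines CREATE_TRACE_POINTS before including the header, which emits
 * the tracepoint bodies; callers can then report a test result through the
 * generated trace_bpf_test_finish() helper.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>

static void ex_report_test_result(int err)
{
	trace_bpf_test_finish(&err);
}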
1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/crc-ccitt.c */ #include <linux/types.h> #include <linux/module.h> #include <linux/crc-ccitt.h> /* * This mysterious table is just the CRC of each possible byte. It can be * computed using the standard bit-at-a-time methods. The polynomial can * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12. * Add the implicit x^16, and you have the standard CRC-CCITT. */ u16 const crc_ccitt_table[256] = { 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf, 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7, 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e, 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876, 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd, 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5, 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c, 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974, 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb, 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3, 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a, 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72, 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9, 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1, 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738, 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70, 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7, 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff, 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036, 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e, 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5, 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd, 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134, 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c, 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3, 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb, 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232, 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a, 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1, 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9, 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330, 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78 }; EXPORT_SYMBOL(crc_ccitt_table); /* * Similar table to calculate CRC16 variant known as CRC-CCITT-FALSE * Reflected bits order, does not augment final value. 
*/ u16 const crc_ccitt_false_table[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF, 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D, 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4, 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC, 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823, 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B, 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12, 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A, 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41, 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49, 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70, 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78, 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F, 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067, 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E, 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256, 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D, 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C, 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634, 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB, 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3, 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A, 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92, 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9, 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1, 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8, 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0 }; EXPORT_SYMBOL(crc_ccitt_false_table); /** * crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data * buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer */ u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc_ccitt_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc_ccitt); /** * crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant) * for the data buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer */ u16 crc_ccitt_false(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc_ccitt_false_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc_ccitt_false); MODULE_DESCRIPTION("CRC-CCITT calculations"); MODULE_LICENSE("GPL"); |
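/*
 * Usage sketch for the helpers above: both fold a buffer into a running
 * 16-bit CRC one byte at a time, so a frame can be processed in chunks by
 * feeding the previous result back in as the seed. Seeding with 0xffff is
 * the usual convention for both variants (an assumption of this example,
 * not something this file enforces).
 */
#include <linux/crc-ccitt.h>

static u16 ex_frame_crc(const u8 *hdr, size_t hdr_len,
			const u8 *payload, size_t payload_len)
{
	u16 crc = crc_ccitt(0xffff, hdr, hdr_len);

	return crc_ccitt(crc, payload, payload_len);
}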
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
#include <linux/fs_parser.h>

#include <trace/events/cgroup.h>

/*
 * pidlists linger the following amount before being destroyed. The goal
 * is avoiding frequent destruction in the middle of consecutive read calls
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/* disable named v1 mounts */
static bool cgroup_no_v1_named;

/*
 * pidlist destructions need to be flushed on cgroup destruction. Use a
 * separate workqueue as flush domain.
*/ static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached * * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroup_root *root; int retval = 0; cgroup_lock(); cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } cgroup_attach_unlock(true); cgroup_unlock(); return retval; } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * * Locking rules between cgroup_post_fork() and the migration path * guarantee that, if a task is forking while being migrated, the new child * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. * * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; int ret; if (cgroup_on_dfl(to)) return -EINVAL; ret = cgroup_migrate_vet_dst(to); if (ret) return ret; cgroup_lock(); cgroup_attach_lock(true); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; /* * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { css_task_iter_start(&from->self, 0, &it); do { task = css_task_iter_next(&it); } while (task && (task->flags & PF_EXITING)); if (task) get_task_struct(task); css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(true); cgroup_unlock(); return ret; } /* * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* which pidlist file are we talking about? */ enum cgroup_filetype { CGROUP_FILE_PROCS, CGROUP_FILE_TASKS, }; /* * A pidlist is a list of pids that virtually represents the contents of one * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, * a pair (one each for procs, tasks) for each pid namespace that's relevant * to the cgroup. */ struct cgroup_pidlist { /* * used to find which pidlist is wanted. doesn't change as long as * this particular list stays in the list. 
*/ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* for delayed destruction */ struct delayed_work destroy_dwork; }; /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); mutex_unlock(&cgrp->pidlist_mutex); flush_workqueue(cgroup_pidlist_destroy_wq); BUG_ON(!list_empty(&cgrp->pidlists)); } static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, destroy_dwork); struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); /* * Destroy iff we didn't get queued again. The state won't change * as destroy_dwork can only be queued while locked. */ if (!delayed_work_pending(dwork)) { list_del(&l->links); kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; /* * we presume the 0th element is unique, so i starts at 1. trivial * edge cases first; no work needs to be done for either */ if (length == 0 || length == 1) return length; /* src and dest walk down the list; dest counts unique elements */ for (src = 1; src < length; src++) { /* find next unique element */ while (list[src] == list[src-1]) { src++; if (src == length) goto after; } /* dest always points to where the next unique element goes */ list[dest] = list[src]; dest++; } after: return dest; } /* * The two pid files - task and cgroup.procs - guaranteed that the result * is sorted, which forced this whole pidlist fiasco. As pid order is * different per namespace, each namespace needs differently sorted list, * making it impossible to use, for example, single rbtree of member tasks * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. */ static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ struct pid_namespace *ns = task_active_pid_ns(current); lockdep_assert_held(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) if (l->key.type == type && l->key.ns == ns) return l; return NULL; } /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. 
*/ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. */ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. */ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. 
*/ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] < pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. 
*/ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. 
* * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. 
*/ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct 
fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? */ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. 
*/ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create cgroup_root to use. * Called with cgroup_mutex held; returns 0 on success, -E... on * error and positive - in case when the candidate is busy dying. * On success it stashes a reference to cgroup_root into given * cgroup_fs_context; that reference is *NOT* counting towards the * cgroup_root refcount. */ static int cgroup1_root_to_use(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret; /* First find the desired set of subsystems */ ret = check_cgroupfs_options(fc); if (ret) return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) return 1; /* restart */ cgroup_put(&ss->root->cgrp); } for_each_root(root) { bool name_match = false; if (root == &cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (ctx->name) { if (strcmp(ctx->name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. 
*/ if ((ctx->subsys_mask || ctx->none) && (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; return -EBUSY; } if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); ctx->root = root; return 0; } /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return -ENOMEM; ctx->root = root; init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); if (!ret) cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); else cgroup_free_root(root); return ret; } int cgroup1_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; /* Check if the caller has permission to mount. */ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); ret = cgroup1_root_to_use(fc); if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { fc_drop_locked(fc); ret = 1; } if (unlikely(ret > 0)) { msleep(10); return restart_syscall(); } return ret; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); |
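/*
 * Illustrative sketch (not part of this file): a minimal userspace release
 * agent, written against the invocation contract documented above
 * cgroup1_release_agent() -- the kernel execs the configured binary with
 * argv[1] set to the released cgroup's path relative to the hierarchy root
 * and only HOME/PATH in the environment. The program and log path are
 * hypothetical; the block is guarded out so it is never built here.
 */
#if 0	/* userspace example only */
#include <stdio.h>

int main(int argc, char *argv[])
{
	FILE *log;

	if (argc < 2)
		return 1;

	/* argv[1] is e.g. "/my_service/worker3"; a real agent would typically rmdir it */
	log = fopen("/var/log/cgroup-release.log", "a");
	if (!log)
		return 1;
	fprintf(log, "released cgroup: %s\n", argv[1]);
	fclose(log);
	return 0;
}
#endif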
// SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .gid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .projid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. * But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that..
*/ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init); |
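/*
 * Illustrative sketch (not part of this file): the find_user()/free_uid()
 * pairing described in the comment above find_user(). The uid value (1000 in
 * the initial namespace) and the helper name are hypothetical; the block is
 * guarded out so it is never built.
 */
#if 0
static void example_user_lookup(void)
{
	struct user_struct *up;

	/* takes a reference on the user_struct if one exists */
	up = find_user(make_kuid(&init_user_ns, 1000));
	if (!up)
		return;

	/* ... inspect per-user state here ... */

	/* drop the reference taken by find_user() */
	free_uid(up);
}
#endif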
// SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch);
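/*
 * Illustrative sketch (not part of this file): a caller using bsearch() as
 * documented above, with a key type (u32) that differs from the element
 * type. The table contents and helper names are made up; the block is
 * guarded out so it is never built.
 */
#if 0
struct example_sym {
	u32 addr;
	const char *name;
};

/* must be kept sorted by ->addr for bsearch() to work */
static const struct example_sym example_syms[] = {
	{ 0x1000, "init"   },
	{ 0x2000, "probe"  },
	{ 0x3000, "remove" },
};

static int example_sym_cmp(const void *key, const void *elt)
{
	u32 addr = *(const u32 *)key;
	const struct example_sym *sym = elt;

	if (addr < sym->addr)
		return -1;
	if (addr > sym->addr)
		return 1;
	return 0;
}

static const char *example_sym_name(u32 addr)
{
	const struct example_sym *sym;

	sym = bsearch(&addr, example_syms, ARRAY_SIZE(example_syms),
		      sizeof(example_syms[0]), example_sym_cmp);
	return sym ? sym->name : NULL;
}
#endif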
// SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t =
&ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. 
*/ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* Last few attempts to find a free tcp port. Destructive * action: evict the colliding entry if it's in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipulable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall through */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range.
*/ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. 
*/ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else off = get_random_u16(); attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. 
* At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. 
*/ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibility to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets.
*/ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod.
*/ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V4_MAXIP]) range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); else range->max_addr.ip = range->min_addr.ip; return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? 
-ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. 
*/ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); |
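/*
 * Illustrative sketch (not part of this file): how a NAT setup path (for
 * example an SNAT target) might fill an nf_nat_range2 and hand it to
 * nf_nat_setup_info(), which then runs the tuple-selection logic above
 * (find_best_ips_proto() plus nf_nat_l4proto_unique_tuple()). The address
 * and port values are hypothetical; the block is guarded out so it is
 * never built.
 */
#if 0
static unsigned int example_snat_setup(struct nf_conn *ct)
{
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS |
				  NF_NAT_RANGE_PROTO_SPECIFIED,
		.min_addr.ip	= htonl(0xc0a80001),	/* 192.168.0.1 */
		.max_addr.ip	= htonl(0xc0a80001),	/* single address */
		.min_proto.all	= htons(10000),		/* restrict source ports */
		.max_proto.all	= htons(10050),
	};

	/* map the original source; get_unique_tuple() resolves collisions */
	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
#endif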
// SPDX-License-Identifier: GPL-2.0
/*
 * ring buffer based function tracer
 *
 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Originally taken from the RT patch by:
 *    Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 Nadia
Yvette Chambers */ #include <linux/ring_buffer.h> #include <generated/utsrelease.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/irqflags.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ #include "trace.h" #include "trace_output.h" /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; #ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; /* * If boot-time tracing including tracers/events via kernel cmdline * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { tracing_selftest_disabled = true; pr_info("Ftrace startup test is disabled due to %s\n", reason); } } #else #define tracing_selftest_running 0 #define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } }; static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event * occurred. */ static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets * this back to zero. */ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for * capturing traces that lead to crashes and outputing it to a * serial console. 
* * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops */ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; }; union trace_eval_map_item; struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved eval_map items. */ union trace_eval_map_item { struct trace_eval_map map; struct trace_eval_map_head head; struct trace_eval_map_tail tail; }; static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; static bool snapshot_at_boot; static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { if (*str++ != '=' || !*str || !strcmp("1", str)) { ftrace_dump_on_oops = DUMP_ALL; return 1; } if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { ftrace_dump_on_oops = DUMP_ORIG; return 1; } return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) __disable_trace_on_warning = 1; return 1; } __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { char *slot = boot_snapshot_info + boot_snapshot_index; int left = sizeof(boot_snapshot_info) - boot_snapshot_index; int ret; if (str[0] == '=') { str++; if (strlen(str) >= left) return -1; ret = snprintf(slot, left, "%s\t", str); boot_snapshot_index += ret; } else { allocate_snapshot = true; /* We also need the main ring buffer expanded */ ring_buffer_expanded = true; } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); static int __init boot_snapshot(char *str) { snapshot_at_boot = true; boot_alloc_snapshot(str); return 1; } __setup("ftrace_boot_snapshot", boot_snapshot); static int __init boot_instance(char *str) { char *slot = boot_instance_info + boot_instance_index; int left = sizeof(boot_instance_info) - boot_instance_index; int ret; if 
(strlen(str) >= left) return -1; ret = snprintf(slot, left, "%s\t", str); boot_instance_index += ret; return 1; } __setup("trace_instance=", boot_instance); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { /* Ignore the "tp_printk_stop_on_boot" param */ if (*str == '_') return 0; if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; } __setup("tp_printk", set_tracepoint_printk); static int __init set_tracepoint_printk_stop(char *str) { tracepoint_printk_stop_on_boot = true; return 1; } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); unsigned long long ns2usecs(u64 nsec) { nsec += 500; do_div(nsec, 1000); return nsec; } static void trace_process_export(struct trace_export *export, struct ring_buffer_event *event, int flag) { struct trace_entry *entry; unsigned int size = 0; if (export->flags & flag) { entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); export->write(export, entry, size); } } static DEFINE_MUTEX(ftrace_export_lock); static struct trace_export __rcu *ftrace_exports_list __read_mostly; static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); static inline void ftrace_exports_enable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_inc(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_inc(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_inc(&trace_marker_exports_enabled); } static inline void ftrace_exports_disable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_dec(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_dec(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_dec(&trace_marker_exports_enabled); } static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; preempt_disable_notrace(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); } static inline void add_trace_export(struct trace_export **list, struct trace_export *export) { rcu_assign_pointer(export->next, *list); /* * We are entering export into the list but another * CPU might be walking that list. We need to make sure * the export->next pointer is valid before another CPU sees * the export pointer included into the list. 
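 *
 * In other words, the two assignments must stay in this order:
 *
 *	rcu_assign_pointer(export->next, *list);	(1) link the new export
 *	rcu_assign_pointer(*list, export);		(2) then publish it
 *
 * so that a concurrent ftrace_exports() walker that observes (2) is
 * guaranteed to also observe the ->next value stored in (1);
 * rcu_assign_pointer() provides the release ordering that makes this hold.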
*/ rcu_assign_pointer(*list, export); } static inline int rm_trace_export(struct trace_export **list, struct trace_export *export) { struct trace_export **p; for (p = list; *p != NULL; p = &(*p)->next) if (*p == export) break; if (*p != export) return -1; rcu_assign_pointer(*p, (*p)->next); return 0; } static inline void add_ftrace_export(struct trace_export **list, struct trace_export *export) { ftrace_exports_enable(export); add_trace_export(list, export); } static inline int rm_ftrace_export(struct trace_export **list, struct trace_export *export) { int ret; ret = rm_trace_export(list, export); ftrace_exports_disable(export); return ret; } int register_ftrace_export(struct trace_export *export) { if (WARN_ON_ONCE(!export->write)) return -1; mutex_lock(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { int ret; mutex_lock(&ftrace_export_lock); ret = rm_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ TRACE_ITER_HASH_PTR) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) /* * The global_trace is the descriptor that holds the top-level tracing * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; int ret = -ENODEV; mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; ret = 0; break; } } mutex_unlock(&trace_types_lock); return ret; } static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; } /** * trace_array_put - Decrement the reference counter for this trace array. * @this_tr : pointer to the trace array * * NOTE: Use this when we no longer need the trace array returned by * trace_array_get_by_name(). This ensures the trace array can be later * destroyed. 
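 *
 * A typical pairing, as an illustrative sketch with placeholder names
 * (exact signatures can vary between kernel versions), looks like:
 *
 *	tr = trace_array_get_by_name("my_instance");
 *	if (tr) {
 *		trace_array_printk(tr, _THIS_IP_, "hello\n");
 *		trace_array_put(tr);
 *	}
 *
 * where the put balances the reference taken by the lookup.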
* */ void trace_array_put(struct trace_array *this_tr) { if (!this_tr) return; mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; if (tracing_disabled) return -ENODEV; if (tr && trace_array_get(tr) < 0) return -ENODEV; return 0; } int call_filter_check_discard(struct trace_event_call *call, void *rec, struct trace_buffer *buffer, struct ring_buffer_event *event) { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { __trace_event_discard_commit(buffer, event); return 1; } return 0; } /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { return trace_pid_list_is_set(filtered_pids, search_pid); } /** * trace_ignore_this_task - should a task be ignored for tracing * @filtered_pids: The list of pids to check * @filtered_no_pids: The list of pids not to be traced * @task: The task that should be ignored if not filtered * * Checks if @task should be traced or not from @filtered_pids. * Returns true if @task should *NOT* be traced. * Returns false if @task should be traced. */ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task) { /* * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids * within filtered_pids. */ return (filtered_pids && !trace_find_filtered_pid(filtered_pids, task->pid)) || (filtered_no_pids && trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove * * If adding a task, if @self is defined, the task is only added if @self * is also included in @pid_list. This happens on fork and tasks should * only be added when the parent is listed. If @self is NULL, then the * @task pid will be removed from the list, which would happen on exit * of a task. */ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { if (!trace_find_filtered_pid(pid_list, self->pid)) return; } /* "self" is set for forks, and NULL for exits */ if (self) trace_pid_list_set(pid_list, task->pid); else trace_pid_list_clear(pid_list, task->pid); } /** * trace_pid_next - Used for seq_file to get to the next pid of a pid_list * @pid_list: The pid list to show * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) * @pos: The position of the file * * This is used by the seq_file "next" operation to iterate the pids * listed in a trace_pid_list structure. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. 
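 *
 * For example, if the next set pid is 0, the value 1 (i.e. (void *)1) is
 * returned; a NULL return therefore unambiguously means "no more pids"
 * rather than "pid 0".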
*/ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { long pid = (unsigned long)v; unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ if (trace_pid_list_next(pid_list, pid, &next) < 0) return NULL; pid = next; /* Return pid + 1 to allow zero to be represented */ return (void *)(pid + 1); } /** * trace_pid_start - Used for seq_file to start reading pid lists * @pid_list: The pid list to show * @pos: The position of the file * * This is used by seq_file "start" operation to start the iteration * of listing pids. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; unsigned int first; loff_t l = 0; if (trace_pid_list_first(pid_list, &first) < 0) return NULL; pid = first; /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) ; return (void *)pid; } /** * trace_pid_show - show the current pid in seq_file processing * @m: The seq_file structure to write into * @v: A void pointer of the pid (+1) value to display * * Can be directly used by seq_file operations to display the current * pid value. */ int trace_pid_show(struct seq_file *m, void *v) { unsigned long pid = (unsigned long)v - 1; seq_printf(m, "%lu\n", pid); return 0; } /* 128 should be much more than enough */ #define PID_BUF_SIZE 127 int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt) { struct trace_pid_list *pid_list; struct trace_parser parser; unsigned long val; int nr_pids = 0; ssize_t read = 0; ssize_t ret; loff_t pos; pid_t pid; if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) return -ENOMEM; /* * Always recreate a new array. The write is an all or nothing * operation. Always create a new array when adding new pids by * the user. If the operation fails, then the current list is * not modified. */ pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } if (filtered_pids) { /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { trace_pid_list_set(pid_list, pid); ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } ret = 0; while (cnt > 0) { pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; if (!trace_parser_loaded(&parser)) break; ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; pid = (pid_t)val; if (trace_pid_list_set(pid_list, pid) < 0) { ret = -1; break; } nr_pids++; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); pid_list = NULL; } *new_pid_list = pid_list; return read; } static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ if (!buf->buffer) return trace_clock_local(); ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } u64 ftrace_now(int cpu) { return buffer_ftrace_now(&global_trace.array_buffer, cpu); } /** * tracing_is_enabled - Show if global_trace has been enabled * * Shows if the global trace has been enabled or not. 
It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for * the irqsoff tracer. But it may be inaccurate due to races. If you * need to know the accurate state, use tracing_is_on() which is a little * slower, but accurate. */ int tracing_is_enabled(void) { /* * For quick access (irqsoff uses this in fast path), just * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ smp_rmb(); return !global_trace.buffer_disabled; } /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded * to page size. * * This number is purposely set to a low number of 16384. * If the dump on oops happens, it will be much appreciated * to not have to wait for all that output. Anyway this can be * boot time and run time configurable. */ #define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer * * ring buffer serializes readers, but it is low level protection. * The validity of the events (which returns by ring_buffer_peek() ..etc) * are not protected by ring buffer. * * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. * * These primitives allow multi process access to different cpu ring buffer * concurrently. * * These primitives don't distinguish read-only and read-consume access. * Multi read-only access are also serialized. */ #ifdef CONFIG_SMP static DECLARE_RWSEM(all_cpu_access_lock); static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. 
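 * A per-cpu reader thus holds all_cpu_access_lock for read plus this
 * cpu's mutex, while a RING_BUFFER_ALL_CPUS caller takes the rwsem for
 * write, so the two kinds of access exclude each other.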
*/ mutex_lock(&per_cpu(cpu_access_lock, cpu)); } } static inline void trace_access_unlock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); up_read(&all_cpu_access_lock); } } static inline void trace_access_lock_init(void) { int cpu; for_each_possible_cpu(cpu) mutex_init(&per_cpu(cpu_access_lock, cpu)); } #else static DEFINE_MUTEX(access_lock); static inline void trace_access_lock(int cpu) { (void)cpu; mutex_lock(&access_lock); } static inline void trace_access_unlock(int cpu) { (void)cpu; mutex_unlock(&access_lock); } static inline void trace_access_lock_init(void) { } #endif #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long trace_ctx, int skip, struct pt_regs *regs) { } #endif static __always_inline void trace_event_setup(struct ring_buffer_event *event, int type, unsigned int trace_ctx) { struct trace_entry *ent = ring_buffer_event_data(event); tracing_generic_entry_update(ent, type, trace_ctx); } static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) trace_event_setup(event, type, trace_ctx); return event; } void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 0; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_on - enable tracing buffers * * This function enables tracing buffers that may have been * disabled with tracing_off. 
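 *
 * A common, purely illustrative pattern is to let tracing run and call
 * tracing_off() at the point where a problem is detected, so the ring
 * buffer stops overwriting the interesting data; tracing_on() re-arms
 * recording afterwards. The same switch is exposed to user space via the
 * "tracing_on" file in tracefs.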
*/ void tracing_on(void) { tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { /* Length is in event->array[0] */ ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer); } int __trace_array_puts(struct trace_array *tr, unsigned long ip, const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; unsigned int trace_ctx; int alloc; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) return 0; if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); if (!event) { size = 0; goto out; } entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, str, size); /* Add a newline if necessary */ if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; } else entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write * @size: The size of the string. */ int __trace_puts(unsigned long ip, const char *str, int size) { return __trace_array_puts(&global_trace, ip, str, size); } EXPORT_SYMBOL_GPL(__trace_puts); /** * __trace_bputs - write the pointer to a constant string into trace buffer * @ip: The address of the caller * @str: The constant string to write to the buffer to */ int __trace_bputs(unsigned long ip, const char *str) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); int ret = 0; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; trace_ctx = tracing_gen_ctx(); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); ret = 1; out: ring_buffer_nest_end(buffer); return ret; } EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT static void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); trace_array_puts(tr, "*** stopping trace here! 
***\n"); tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } void tracing_snapshot_instance(struct trace_array *tr) { tracing_snapshot_instance_cond(tr, NULL); } /** * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. */ void tracing_snapshot(void) { struct trace_array *tr = &global_trace; tracing_snapshot_instance(tr); } EXPORT_SYMBOL_GPL(tracing_snapshot); /** * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. * @tr: The tracing instance to snapshot * @cond_data: The data to be tested conditionally, and possibly saved * * This is the same as tracing_snapshot() except that the snapshot is * conditional - the snapshot will only happen if the * cond_snapshot.update() implementation receiving the cond_data * returns true, which means that the trace array's cond_snapshot * update() operation used the cond_data to determine whether the * snapshot should be taken, and if it was, presumably saved it along * with the snapshot. */ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { tracing_snapshot_instance_cond(tr, cond_data); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using * tracing_snapshot_cond_enable(), the user-defined cond_data is saved * with the snapshot. This accessor is used to retrieve it. * * Should not be called from cond_snapshot.update(), since it takes * the tr->max_lock lock, which the code calling * cond_snapshot.update() has already done. * * Returns the cond_data associated with the trace array's snapshot. */ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); local_irq_enable(); return cond_data; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { int ret; if (!tr->allocated_snapshot) { /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; tr->allocated_snapshot = true; } return 0; } static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. 
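 * Shrinking the max buffer to its minimum size keeps it allocated (and
 * its clock and other settings intact) while giving back nearly all of
 * its memory.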
*/ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); tr->allocated_snapshot = false; } /** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already * allocated - it doesn't also take a snapshot. * * This is meant to be used in cases where the snapshot buffer needs * to be set up for events that can't sleep but need to be able to * trigger a snapshot. */ int tracing_alloc_snapshot(void) { struct trace_array *tr = &global_trace; int ret; ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. */ void tracing_snapshot_alloc(void) { int ret; ret = tracing_alloc_snapshot(); if (ret < 0) return; tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); /** * tracing_snapshot_cond_enable - enable conditional snapshot for an instance * @tr: The tracing instance * @cond_data: User data to associate with the snapshot * @update: Implementation of the cond_snapshot update function * * Check whether the conditional snapshot for the given instance has * already been enabled, or if the current tracer is already using a * snapshot; if so, return -EBUSY, else create a cond_snapshot and * save the cond_data and update function inside. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { struct cond_snapshot *cond_snapshot; int ret = 0; cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; mutex_lock(&trace_types_lock); ret = tracing_alloc_snapshot_instance(tr); if (ret) goto fail_unlock; if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto fail_unlock; } /* * The cond_snapshot can only change to NULL without the * trace_types_lock. We don't care if we race with it going * to NULL, but we want to make sure that it's not set to * something other than NULL when we get here, which we can * do safely with only holding the trace_types_lock and not * having to take the max_lock. */ if (tr->cond_snapshot) { ret = -EBUSY; goto fail_unlock; } local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); local_irq_enable(); mutex_unlock(&trace_types_lock); return ret; fail_unlock: mutex_unlock(&trace_types_lock); kfree(cond_snapshot); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); /** * tracing_snapshot_cond_disable - disable conditional snapshot for an instance * @tr: The tracing instance * * Check whether the conditional snapshot for the given instance is * enabled; if so, free the cond_snapshot associated with it, * otherwise return -EINVAL. * * Returns 0 if successful, error otherwise. 
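 *
 * This is the tear-down counterpart of tracing_snapshot_cond_enable() and
 * should be called once conditional snapshots are no longer wanted for
 * the instance.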
*/ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) ret = -EINVAL; else { kfree(tr->cond_snapshot); tr->cond_snapshot = NULL; } arch_spin_unlock(&tr->max_lock); local_irq_enable(); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); void *tracing_cond_snapshot_data(struct trace_array *tr) { return NULL; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); int tracing_snapshot_cond_disable(struct trace_array *tr) { return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 1; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. * It does not disable any overhead the tracers themselves may * be causing. This function simply causes all recording to * the ring buffers to fail. */ void tracing_off(void) { tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); } } /** * tracer_tracing_is_on - show real state of ring buffer enabled * @tr : the trace array to know if ring buffer is enabled * * Shows real state of the ring buffer if it is enabled or not. */ bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) return ring_buffer_record_is_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); static int __init set_buf_size(char *str) { unsigned long buf_size; if (!str) return 0; buf_size = memparse(str, &str); /* * nr_entries can not be zero and the startup * tests require some buffer space. Therefore * ensure we have at least 4096 bytes of buffer. 
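 *
 * The string is parsed with memparse(), so size suffixes are accepted;
 * for example (illustrative), booting with "trace_buf_size=8M" requests
 * an 8 MiB buffer, while anything below 4096 bytes is raised to 4096.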
*/ trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { unsigned long threshold; int ret; if (!str) return 0; ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; } /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b /* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL }; static struct { u64 (*func)(void); const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; bool trace_clock_in_ns(struct trace_array *tr) { if (trace_clocks[tr->clock_id].in_ns) return true; return false; } /* * trace_parser_get_init - gets the buffer for trace parser */ int trace_parser_get_init(struct trace_parser *parser, int size) { memset(parser, 0, sizeof(*parser)); parser->buffer = kmalloc(size, GFP_KERNEL); if (!parser->buffer) return 1; parser->size = size; return 0; } /* * trace_parser_put - frees the buffer for trace parser */ void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); parser->buffer = NULL; } /* * trace_get_user - reads the user input string separated by space * (matched by isspace(ch)) * * For each string found the 'struct trace_parser' is updated, * and the function returns. * * Returns number of bytes read. * * See kernel/trace/trace.h for 'struct trace_parser' details. */ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos) { char ch; size_t read = 0; ssize_t ret; if (!*ppos) trace_parser_clear(parser); ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; /* * The parser is not finished with the last write, * continue reading the user input without skipping spaces. */ if (!parser->cont) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } parser->idx = 0; /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } } /* read the non-space input */ while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; goto out; } ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } /* We either got finished input or we have to wait for another call. */ if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; /* Make sure the parsed string always terminates with '\0'. 
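 * (parser->cont was set above so that the next call keeps appending to
 * this partially read token instead of skipping leading whitespace.)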
*/ parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; } *ppos += read; ret = read; out: return ret; } /* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; if (trace_seq_used(s) <= s->seq.readpos) return -EBUSY; len = trace_seq_used(s) - s->seq.readpos; if (cnt > len) cnt = len; memcpy(buf, s->buffer + s->seq.readpos, cnt); s->seq.readpos += cnt; return cnt; } unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY static struct workqueue_struct *fsnotify_wq; static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) { struct trace_array *tr = container_of(iwork, struct trace_array, fsnotify_irqwork); queue_work(fsnotify_wq, &tr->fsnotify_work); } static void trace_create_maxlat_file(struct trace_array *tr, struct dentry *d_tracer) { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, d_tracer, tr, &tracing_max_lat_fops); } __init static int latency_fsnotify_init(void) { fsnotify_wq = alloc_workqueue("tr_max_lat_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!fsnotify_wq) { pr_err("Unable to allocate tr_max_lat_wq\n"); return -ENOMEM; } return 0; } late_initcall_sync(latency_fsnotify_init); void latency_fsnotify(struct trace_array *tr) { if (!fsnotify_wq) return; /* * We cannot call queue_work(&tr->fsnotify_work) from here because it's * possible that we are called from __schedule() or do_idle(), which * could cause a deadlock. */ irq_work_queue(&tr->fsnotify_irqwork); } #else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, tr, &tracing_max_lat_fops) #endif /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct array_buffer *trace_buf = &tr->array_buffer; struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use * RCU. The irq tracer can be called out of RCU scope. */ if (tsk == current) max_data->uid = current_uid(); else max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; /* record this tasks comm */ tracing_record_cmdline(tsk); latency_fsnotify(tr); } /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. 
* @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data) { if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); /* Inherit the recordable setting from array_buffer */ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) ring_buffer_record_on(tr->max_buffer.buffer); else ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { arch_spin_unlock(&tr->max_lock); return; } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } /** * update_max_tr_single - only copy one trace over, and reset the rest * @tr: tracer * @tsk: task with the latency * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* * We failed to swap the buffer due to a commit taking * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); } #ifdef CONFIG_FTRACE_STARTUP_TEST static bool selftests_can_run; struct trace_selftests { struct list_head list; struct tracer *type; }; static LIST_HEAD(postponed_selftests); static int save_selftest(struct tracer *type) { struct trace_selftests *selftest; selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); if (!selftest) return -ENOMEM; selftest->type = type; list_add(&selftest->list, &postponed_selftests); return 0; } static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; struct tracer *saved_tracer = tr->current_trace; int ret; if (!type->selftest || tracing_selftest_disabled) return 0; /* * If a tracer registers early in boot up (before scheduling is * initialized and such), then do not run its selftests yet. * Instead, run it a little later in the boot process. */ if (!selftests_can_run) return save_selftest(type); if (!tracing_is_on()) { pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", type->name); return 0; } /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. 
The tracer can then run some * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } #endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ WARN_ON(1); return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif printk(KERN_CONT "PASSED\n"); return 0; } static int do_run_tracer_selftest(struct tracer *type) { int ret; /* * Tests can take a long time, especially if they are run one after the * other, as does happen during bootup when all the tracers are * registered. This could cause the soft lockup watchdog to trigger. */ cond_resched(); tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; return ret; } static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; struct tracer *t, **last; int ret; selftests_can_run = true; mutex_lock(&trace_types_lock); if (list_empty(&postponed_selftests)) goto out; pr_info("Running postponed tracer tests:\n"); tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. */ cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { WARN(1, "tracer: %s failed selftest, disabling\n", p->type->name); last = &trace_types; for (t = trace_types; t; t = t->next) { if (t == p->type) { *last = t->next; break; } last = &t->next; } } list_del(&p->list); kfree(p); } tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); return 0; } core_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { return 0; } static inline int do_run_tracer_selftest(struct tracer *type) { return 0; } #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. * @type: the plugin for the tracer * * Register a new plugin tracer. 
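 *
 * A minimal registration, shown only as an illustrative sketch with
 * placeholder names, looks roughly like:
 *
 *	static int my_tracer_init(struct trace_array *tr) { return 0; }
 *	static void my_tracer_reset(struct trace_array *tr) { }
 *
 *	static struct tracer my_tracer __read_mostly = {
 *		.name	= "my_tracer",
 *		.init	= my_tracer_init,
 *		.reset	= my_tracer_reset,
 *	};
 *
 *	static __init int init_my_tracer(void)
 *	{
 *		return register_tracer(&my_tracer);
 *	}
 *	core_initcall(init_my_tracer);
 *
 * Real tracers also fill in output and selftest callbacks.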
*/ int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; if (!type->name) { pr_info("Tracer must have a name\n"); return -1; } if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; } } if (!type->set_flag) type->set_flag = &dummy_set_flag; if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); if (!type->flags) { ret = -ENOMEM; goto out; } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; /* store the tracer for __set_tracer_option */ type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; type->next = trace_types; trace_types = type; add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) goto out_unlock; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; apply_trace_boot_options(); /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); out_unlock: return ret; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; lockdep_assert_held(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } } void tracing_reset_all_online_cpus(void) { mutex_lock(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); mutex_unlock(&trace_types_lock); } /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. */ static int *tgid_map; /* The maximum valid index into tgid_map. */ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX /* * Preemption must be disabled before acquiring trace_cmdline_lock. * The various trace_arrays' max_lock must be acquired in a context * where interrupt is disabled. 
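 * (trace_cmdline_lock is an arch_spinlock_t, which does not disable
 * preemption by itself; a preempted holder would leave other CPUs
 * spinning on the lock, hence the requirement above.)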
*/ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; }; static struct saved_cmdlines_buffer *savedcmd; static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; } static inline void set_cmdline(int idx, const char *cmdline) { strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static int allocate_cmdlines_buffer(unsigned int val, struct saved_cmdlines_buffer *s) { s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); if (!s->map_cmdline_to_pid) return -ENOMEM; s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); if (!s->saved_cmdlines) { kfree(s->map_cmdline_to_pid); return -ENOMEM; } s->cmdline_idx = 0; s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); return 0; } static int trace_create_savedcmd(void) { int ret; savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); if (!savedcmd) return -ENOMEM; ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); if (ret < 0) { kfree(savedcmd); savedcmd = NULL; return -ENOMEM; } return 0; } int is_tracing_stopped(void) { return global_trace.stop_count; } /** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, * this will start the tracer back up. */ void tracing_start(void) { struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) return; raw_spin_lock_irqsave(&global_trace.start_lock, flags); if (--global_trace.stop_count) { if (global_trace.stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); global_trace.stop_count = 0; } goto out; } /* Prevent the buffers from switching */ arch_spin_lock(&global_trace.max_lock); buffer = global_trace.array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); } static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) return; /* If global, we need to also start the max tracer */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return tracing_start(); raw_spin_lock_irqsave(&tr->start_lock, flags); if (--tr->stop_count) { if (tr->stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); tr->stop_count = 0; } goto out; } buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** * tracing_stop - quick stop of the tracer * * Light weight way to stop tracing. Use in conjunction with * tracing_start. 
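 *
 * Unlike tracing_off(), these calls nest: tracing_stop() increments
 * global_trace.stop_count, and recording resumes only once a matching
 * number of tracing_start() calls brings the count back to zero.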
*/ void tracing_stop(void) { struct trace_buffer *buffer; unsigned long flags; raw_spin_lock_irqsave(&global_trace.start_lock, flags); if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ arch_spin_lock(&global_trace.max_lock); buffer = global_trace.array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); } static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; /* If global, we need to also stop the max tracer */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return tracing_stop(); raw_spin_lock_irqsave(&tr->start_lock, flags); if (tr->stop_count++) goto out; buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } static int trace_save_cmdline(struct task_struct *tsk) { unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. * * This is called within the scheduler and wake up, so interrupts * had better been disabled and run queue lock been held. */ lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); return 1; } static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; int tpid; if (!pid) { strcpy(comm, "<idle>"); return; } if (WARN_ON_ONCE(pid < 0)) { strcpy(comm, "<XXX>"); return; } tpid = pid & (PID_MAX_DEFAULT - 1); map = savedcmd->map_pid_to_cmdline[tpid]; if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) { preempt_disable(); arch_spin_lock(&trace_cmdline_lock); __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int *trace_find_tgid_ptr(int pid) { /* * Pairs with the smp_store_release in set_tracer_flag() to ensure that * if we observe a non-NULL tgid_map then we also observe the correct * tgid_map_max. */ int *map = smp_load_acquire(&tgid_map); if (unlikely(!map || pid > tgid_map_max)) return NULL; return &map[pid]; } int trace_find_tgid(int pid) { int *ptr = trace_find_tgid_ptr(pid); return ptr ? 
*ptr : 0; } static int trace_save_tgid(struct task_struct *tsk) { int *ptr; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; ptr = trace_find_tgid_ptr(tsk->pid); if (!ptr) return 0; *ptr = tsk->tgid; return 1; } static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; } /** * tracing_record_taskinfo - record the task info of a task * * @task: task to record * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * * @prev: previous task during sched_switch * @next: next task during sched_switch * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /* Helpers to record a specific task information */ void tracing_record_cmdline(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); } void tracing_record_tgid(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_TGID); } /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function * simplifies those functions and keeps them in sync. */ enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? 
TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } EXPORT_SYMBOL_GPL(trace_handle_return); static unsigned short migration_disable_value(void) { #if defined(CONFIG_SMP) return current->migration_disabled; #else return 0; #endif } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; unsigned int pc; pc = preempt_count(); if (pc & NMI_MASK) trace_flags |= TRACE_FLAG_NMI; if (pc & HARDIRQ_MASK) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DEFINE_PER_CPU(int, trace_buffered_event_cnt); static int trace_buffered_event_ref; /** * trace_buffered_event_enable - enable buffering events * * When events are being filtered, it is quicker to use a temporary * buffer to write the event data into if there's a likely chance * that it will not be committed. The discard of the ring buffer * is not as fast as committing, and is much slower than copying * a commit. * * When an event is to be filtered, allocate per cpu buffers to * write the event data into, and if the event is filtered and discarded * it is simply dropped, otherwise, the entire data is to be committed * in one shot. */ void trace_buffered_event_enable(void) { struct ring_buffer_event *event; struct page *page; int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (trace_buffered_event_ref++) return; for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto failed; event = page_address(page); memset(event, 0, sizeof(*event)); per_cpu(trace_buffered_event, cpu) = event; preempt_disable(); if (cpu == smp_processor_id() && __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); } return; failed: trace_buffered_event_disable(); } static void enable_trace_buffered_event(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } static void disable_trace_buffered_event(void *data) { this_cpu_inc(trace_buffered_event_cnt); } /** * trace_buffered_event_disable - disable buffering events * * When a filter is removed, it is faster to not use the buffered * events, and to commit directly into the ring buffer. Free up * the temp buffers when there are no more users. This requires * special synchronization with current events. */ void trace_buffered_event_disable(void) { int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (WARN_ON_ONCE(!trace_buffered_event_ref)) return; if (--trace_buffered_event_ref) return; preempt_disable(); /* For each CPU, set the buffer as used. 
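 * Incrementing trace_buffered_event_cnt makes the val == 1 fast path in
 * trace_event_buffer_lock_reserve() fail, so writers fall back to the
 * ring buffer proper while the per-cpu pages are torn down; the
 * synchronize_rcu() below waits for writers that already grabbed a page.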
*/ smp_call_function_many(tracing_buffer_mask, disable_trace_buffered_event, NULL, 1); preempt_enable(); /* Wait for all current users to finish */ synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } /* * Make sure trace_buffered_event is NULL before clearing * trace_buffered_event_cnt. */ smp_wmb(); preempt_disable(); /* Do the work on each cpu */ smp_call_function_many(tracing_buffer_mask, enable_trace_buffered_event, NULL, 1); preempt_enable(); } static struct trace_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *entry; struct trace_array *tr = trace_file->tr; int val; *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, * where the type_len is zero and the array[0] will * hold the full length. * (see include/linux/ring-buffer.h for details on * how the ring_buffer_event is structured). * * Using a temp buffer during filtering and copying it * on a matched filter is quicker than writing directly * into the ring buffer and then discarding it when * it doesn't match. That is because the discard * requires several atomic operations to get right. * Copying on match and doing nothing on a failed match * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ if ((entry = __this_cpu_read(trace_buffered_event))) { int max_len = PAGE_SIZE - struct_size(entry, array, 1); val = this_cpu_inc_return(trace_buffered_event_cnt); /* * Preemption is disabled, but interrupts and NMIs * can still come in now. If that happens after * the above increment, then it will have to go * back to the old method of allocating the event * on the ring buffer, and if the filter fails, it * will have to call ring_buffer_discard_commit() * to remove it. * * Need to also check the unlikely case that the * length is bigger than the temp buffer size. * If that happens, then the reserve is pretty much * guaranteed to fail, as the ring buffer currently * only allows events less than a page. But that may * change in the future, so let the ring buffer reserve * handle the failure in that case. */ if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; /* Return with preemption disabled */ return entry; } this_cpu_dec(trace_buffered_event_cnt); } /* __trace_buffer_lock_reserve() disables preemption */ preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. 
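 * The temp_buffer path below is only taken when the real reserve failed
 * (e.g. tracing is off) but a conditional trigger still needs to look at
 * the event fields to decide whether it should fire.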
*/ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) { struct trace_event_call *event_call; struct trace_event_file *file; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; /* We should never get here if iter is NULL */ if (WARN_ON_ONCE(!iter)) return; event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; file = fbuffer->trace_file; if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, fbuffer->entry))) return; event = &fbuffer->trace_file->event_call->event; raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; int ret; mutex_lock(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * This will force exiting early, as tracepoint_printk * is always zero when tracepoint_printk_iter is not allocated */ if (!tracepoint_print_iter) tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) goto out; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); out: mutex_unlock(&tracepoint_printk_mutex); return ret; } void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { enum event_trigger_type tt = ETT_NONE; struct trace_event_file *file = fbuffer->trace_file; if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, fbuffer->entry, &tt)) goto discard; if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); discard: if (tt) event_triggers_post_call(file, tt); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); /* * Skip 3: * * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); /* * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); ftrace_trace_userstack(tr, buffer, trace_ctx); } /* * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. 
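 * Callers that never want the optional kernel/user stack traces appended
 * to the event use this variant and get only the commit itself.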
*/ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event) { __buffer_unlock_commit(buffer, event); } void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } #ifdef CONFIG_STACKTRACE /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 #define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; }; struct ftrace_stacks { struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) skip++; #endif preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* * The above __this_cpu_inc_return() is 'atomic' cpu local. An * interrupt will either see the value pre increment or post * increment. If the interrupt happens pre increment it will have * restored the counter when it returns. We just need a barrier to * keep gcc from moving things around. 
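 * barrier() is a compiler barrier only; no CPU memory barrier is needed
 * here because both the counter and the stack slots are strictly per-CPU.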
*/ barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; size = ARRAY_SIZE(fstack->calls); if (regs) { nr_entries = stack_trace_save_regs(regs, fstack->calls, size, skip); } else { nr_entries = stack_trace_save(fstack->calls, size, skip); } event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->size = nr_entries; memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip) { struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); return; } if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) return; /* * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; ct_irq_enter_irqson(); __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } /** * trace_dump_stack - record a stack back trace in the trace buffer * @skip: Number of functions to skip (helper handlers) */ void trace_dump_stack(int skip) { if (tracing_disabled || tracing_selftest_running) return; #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(global_trace.array_buffer.buffer, tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); #ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* * NMIs can not handle page faults, even with fix ups. * The save user stack can (and often does) fault. */ if (unlikely(in_nmi())) return; /* * prevent recursion, since the user stack tracing may * trigger other kernel events. 
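 * The per-CPU user_stack_count below acts as a reentrancy guard: if a
 * user stack dump is already in progress on this CPU, the nested request
 * is silently dropped instead of recursing.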
*/ preempt_disable(); if (__this_cpu_read(user_stack_count)) goto out; __this_cpu_inc(user_stack_count); event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); out: preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ static inline void func_repeats_set_delta_ts(struct func_repeats_entry *entry, unsigned long long delta) { entry->bottom_delta_ts = delta & U32_MAX; entry->top_delta_ts = (delta >> 32); } void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct func_repeats_entry *entry; struct ring_buffer_event *event; u64 delta; event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, sizeof(*entry), trace_ctx); if (!event) return; delta = ring_buffer_event_time_stamp(buffer, event) - last_info->ts_last_call; entry = ring_buffer_event_data(event); entry->ip = last_info->ip; entry->parent_ip = last_info->parent_ip; entry->count = last_info->count; func_repeats_set_delta_ts(entry, delta); __buffer_unlock_commit(buffer, event); } /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; /* Interrupts must see nesting incremented before we use the buffer */ barrier(); return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) { /* Don't let the decrement of nesting leak before this */ barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; trace_percpu_buffer = buffers; return 0; } static int buffers_allocated; void trace_printk_init_buffers(void) { if (buffers_allocated) return; if (alloc_percpu_trace_buffer()) return; /* trace_printk() is for debug use only. Don't use it in production. */ pr_warn("\n"); pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); pr_warn("** **\n"); pr_warn("** This means that this is a DEBUG kernel and it is **\n"); pr_warn("** unsafe for production use. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your vendor! 
**\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); buffers_allocated = 1; /* * trace_printk_init_buffers() can be called by modules. * If that happens, then we need to start cmdline recording * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { /* Start tracing comms if trace printk is set */ if (!buffers_allocated) return; tracing_start_cmdline_record(); } static void trace_printk_start_stop_comm(int enabled) { if (!buffers_allocated) return; if (enabled) tracing_start_cmdline_record(); else tracing_stop_cmdline_record(); } /** * trace_vbprintk - write binary msg to tracing buffer * @ip: The address of the caller * @fmt: The string format to write to the buffer * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = &global_trace; struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; int len = 0, size; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put; size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); } out: ring_buffer_nest_end(buffer); out_put: put_trace_buf(); out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); __printf(3, 0) static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; unsigned int trace_ctx; char *tbuffer; if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); } out: ring_buffer_nest_end(buffer); put_trace_buf(); 
out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { if (tracing_selftest_running && tr == &global_trace) return 0; return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } /** * trace_array_printk - Print a message to a specific instance * @tr: The instance trace_array descriptor * @ip: The instruction pointer that this is called from. * @fmt: The format to print (printf format) * * If a subsystem sets up its own instance, they have the right to * printk strings into their tracing instance buffer using this * function. Note, this function will not write into the top level * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. * * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return 0; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(trace_array_printk); /** * trace_array_init_printk - Initialize buffers for trace_array_printk() * @tr: The trace array to initialize the buffers for * * As trace_array_printk() only writes into instances, they are OK to * have in the kernel (unlike trace_printk()). This needs to be called * before trace_array_printk() can be used on a trace_array. */ int trace_array_init_printk(struct trace_array *tr) { if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return -EINVAL; return alloc_percpu_trace_buffer(); } EXPORT_SYMBOL_GPL(trace_array_init_printk); __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = __trace_array_vprintk(buffer, ip, fmt, ap); va_end(ap); return ret; } __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); iter->idx++; if (buf_iter) ring_buffer_iter_advance(buf_iter); } static struct trace_entry * peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); if (lost_events) *lost_events = ring_buffer_iter_dropped(buf_iter) ? 
(unsigned long)-1 : 0; } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); } if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); } iter->ent_size = 0; return NULL; } static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int next_size = 0; int cpu; /* * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; return ent; } for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) continue; ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: */ if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; next_ts = ts; next_lost = lost_events; next_size = iter->ent_size; } } iter->ent_size = next_size; if (ent_cpu) *ent_cpu = next_cpu; if (ent_ts) *ent_ts = next_ts; if (missing_events) *missing_events = next_lost; return next; } #define STATIC_FMT_BUF_SIZE 128 static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; /* * iter->tr is NULL when used with tp_printk, which makes * this get called where it is not safe to call krealloc(). */ if (!iter->tr || iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, GFP_KERNEL); if (tmp) { iter->fmt_size += STATIC_FMT_BUF_SIZE; iter->fmt = tmp; } return tmp; } /* Returns true if the string is safe to dereference from an event */ static bool trace_safe_str(struct trace_iterator *iter, const char *str, bool star, int len) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; /* Ignore strings with no length */ if (star && !len) return true; /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) return true; /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) return true; /* Core rodata can not be freed */ if (is_kernel_rodata(addr)) return true; if (trace_is_tracepoint_string(str)) return true; /* * Now this could be a module event, referencing core module * data, which is OK. */ if (!iter->ent) return false; trace_event = ftrace_find_event(iter->ent->type); if (!trace_event) return false; event = container_of(trace_event, struct trace_event_call, event); if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ if (within_module_core(addr, event->module)) return true; return false; } static const char *show_buffer(struct trace_seq *s) { struct seq_buf *seq = &s->seq; seq_buf_terminate(seq); return seq->buffer; } static DEFINE_STATIC_KEY_FALSE(trace_no_verify); static int test_can_verify_check(const char *fmt, ...) { char buf[16]; va_list ap; int ret; /* * The verifier is dependent on vsnprintf() modifies the va_list * passed to it, where it is sent as a reference. 
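 * (Reference semantics here come from va_list being an array type on
 * those architectures, so passing it to vsnprintf() decays to a pointer
 * and the arguments it consumes stay consumed for the caller.)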
Some architectures * (like x86_32) passes it by value, which means that vsnprintf() * does not modify the va_list passed to it, and the verifier * would then need to be able to understand all the values that * vsnprintf can use. If it is passed by value, then the verifier * is disabled. */ va_start(ap, fmt); vsnprintf(buf, 16, "%d", ap); ret = va_arg(ap, int); va_end(ap); return ret; } static void test_can_verify(void) { if (!test_can_verify_check("%d %d", 0, 1)) { pr_info("trace event string verifier disabled\n"); static_branch_inc(&trace_no_verify); } } /** * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed * @fmt: The format used to print the event * @ap: The va_list holding the data to print from @fmt. * * This writes the data into the @iter->seq buffer using the data from * @fmt and @ap. If the format has a %s, then the source of the string * is examined to make sure it is safe to print, otherwise it will * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string * pointer. */ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { const char *p = fmt; const char *str; int i, j; if (WARN_ON_ONCE(!fmt)) return; if (static_branch_unlikely(&trace_no_verify)) goto print; /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; while (*p) { bool star = false; int len = 0; j = 0; /* We only care about %s and variants */ for (i = 0; p[i]; i++) { if (i + 1 >= iter->fmt_size) { /* * If we can't expand the copy buffer, * just print it. */ if (!trace_iter_expand_format(iter)) goto print; } if (p[i] == '\\' && p[i+1]) { i++; continue; } if (p[i] == '%') { /* Need to test cases like %08.*s */ for (j = 1; p[i+j]; j++) { if (isdigit(p[i+j]) || p[i+j] == '.') continue; if (p[i+j] == '*') { star = true; continue; } break; } if (p[i+j] == 's') break; star = false; } j = 0; } /* If no %s found then just print normally */ if (!p[i]) break; /* Copy up to the %s, and print that */ strncpy(iter->fmt, p, i); iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); /* * If iter->seq is full, the above call no longer guarantees * that ap is in sync with fmt processing, and further calls * to va_arg() can return wrong positional arguments. * * Ensure that ap is no longer used in this case. */ if (iter->seq.full) { p = ""; break; } if (star) len = va_arg(ap, int); /* The ap now points to the string data of the %s */ str = va_arg(ap, const char *); /* * If you hit this warning, it is likely that the * trace event in question used %s on a string that * was saved at the time of the event, but may not be * around when the trace is read. Use __string(), * __assign_str() and __get_str() helpers in the TRACE_EVENT() * instead. See samples/trace_events/trace-events-sample.h * for reference. 
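 *
 * A minimal sketch of that pattern (the event and field names are made
 * up purely for illustration):
 *
 *	TRACE_EVENT(sample_event,
 *		TP_PROTO(const char *name),
 *		TP_ARGS(name),
 *		TP_STRUCT__entry(
 *			__string(name, name)
 *		),
 *		TP_fast_assign(
 *			__assign_str(name, name);
 *		),
 *		TP_printk("name=%s", __get_str(name))
 *	);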
*/ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", fmt, show_buffer(&iter->seq))) { int ret; /* Try to safely read the string */ if (star) { if (len + 1 > iter->fmt_size) len = iter->fmt_size - 1; if (len < 0) len = 0; ret = copy_from_kernel_nofault(iter->fmt, str, len); iter->fmt[len] = 0; star = false; } else { ret = strncpy_from_kernel_nofault(iter->fmt, str, iter->fmt_size); } if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", str); else trace_seq_printf(&iter->seq, "(0x%px:%s)", str, iter->fmt); str = "[UNSAFE-MEMORY]"; strcpy(iter->fmt, "%s"); } else { strncpy(iter->fmt, p + i, j + 1); iter->fmt[j+1] = '\0'; } if (star) trace_seq_printf(&iter->seq, iter->fmt, len, str); else trace_seq_printf(&iter->seq, iter->fmt, str); p += i + j + 1; } print: if (*p) trace_seq_vprintf(&iter->seq, p, ap); } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; char *q; if (WARN_ON_ONCE(!fmt)) return fmt; if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) return fmt; p = fmt; new_fmt = q = iter->fmt; while (*p) { if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { if (!trace_iter_expand_format(iter)) return fmt; q += iter->fmt - new_fmt; new_fmt = iter->fmt; } *q++ = *p++; /* Replace %p with %px */ if (p[-1] == '%') { if (p[0] == '%') { *q++ = *p++; } else if (p[0] == 'p' && !isalnum(p[1])) { *q++ = *p++; *q++ = 'x'; } } } *q = '\0'; return new_fmt; } #define STATIC_TEMP_BUF_SIZE 128 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { /* __find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_entry *entry; /* * If called from ftrace_dump(), then the iter->temp buffer * will be the static_temp_buf and not created from kmalloc. * If the entry size is greater than the buffer, we can * not save it. Just return NULL in that case. This is only * used to add markers when two consecutive events' time * stamps have a large delta. See trace_print_lat_context() */ if (iter->temp == static_temp_buf && STATIC_TEMP_BUF_SIZE < ent_size) return NULL; /* * The __find_next_entry() may call peek_next_entry(), which may * call ring_buffer_peek() that may make the contents of iter->ent * undefined. Need to copy iter->ent now. */ if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { void *temp; temp = kmalloc(iter->ent_size, GFP_KERNEL); if (!temp) return NULL; kfree(iter->temp); iter->temp = temp; iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); /* Put back the original ent_size */ iter->ent_size = ent_size; return entry; } /* Find the next real entry, and increment the iterator to the next entry */ void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); return iter->ent ? 
iter : NULL; } static void trace_consume(struct trace_iterator *iter) { ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; int i = (int)*pos; void *ent; WARN_ON_ONCE(iter->leftover); (*pos)++; /* can't go backwards */ if (iter->idx > i) return NULL; if (iter->idx < 0) ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) return; ring_buffer_iter_reset(buf_iter); /* * We could have the case with the max latency tracers * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; ring_buffer_iter_advance(buf_iter); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* * The current tracer is copied to avoid a global locking * all around. */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; mutex_lock(&trace_types_lock); if (unlikely(tr->current_trace != iter->trace)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); iter->trace = tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); #endif if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; iter->idx = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { /* * If we overflowed the seq_file before, then we want * to just reuse the trace_seq buffer again. */ if (iter->leftover) p = iter; else { l = *pos - 1; p = s_next(m, p, &l); } } trace_event_read_lock(); trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; #endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, unsigned long *entries, int cpu) { unsigned long count; count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. 
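 * skipped_entries is set up by tracing_iter_reset() for events that are
 * older than the buffer's time_start; nothing was overwritten in that
 * case, so the ring buffer overrun count is not added to the total.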
 */
	if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
		count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
		/* total is the same as the entries */
		*total = count;
	} else
		*total = count +
			ring_buffer_overrun_cpu(buf->buffer, cpu);
	*entries = count;
}

static void
get_total_entries(struct array_buffer *buf,
		  unsigned long *total, unsigned long *entries)
{
	unsigned long t, e;
	int cpu;

	*total = 0;
	*entries = 0;

	for_each_tracing_cpu(cpu) {
		get_total_entries_cpu(buf, &t, &e, cpu);
		*total += t;