Total coverage: 13600 (2%) of 1199026
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself. If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	WRITE_ONCE(list->next, list);
	WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption reports a warning, and returns false.
 */
extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
							      struct list_head *prev,
							      struct list_head *next);

/*
 * Performs list corruption checks before __list_add(). Returns false if a
 * corruption is detected, true otherwise.
 *
 * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
 * inline to catch non-faulting corruptions, and only if a corruption is
 * detected calls the reporting function __list_add_valid_or_report().
 */
static __always_inline bool __list_add_valid(struct list_head *new,
					     struct list_head *prev,
					     struct list_head *next)
{
	bool ret = true;

	if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
		/*
		 * With the hardening version, elide checking if next and prev
		 * are NULL, since the immediate dereference of them below would
		 * result in a fault if NULL.
* * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). 
*/ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. 
*/ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. 
*/ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. 
* The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. 
*/ #define list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ !list_is_head(pos, (head)); \ pos = rcu_dereference(pos->next)) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. 
*/ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. 
* * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. 
*/ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. 
*/ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
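/*
 * Editor's illustrative sketch (not part of the header above): a minimal
 * example of how the list_head API is typically used from a driver or
 * module. All names here (struct foo_item, foo_list, foo_demo) are
 * hypothetical.
 */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/printk.h>

struct foo_item {
	int value;
	struct list_head node;		/* linkage embedded in the object */
};

static LIST_HEAD(foo_list);		/* empty list: head points to itself */

static void foo_demo(void)
{
	struct foo_item *item, *tmp;
	int i;

	/* append three entries; list_add_tail() keeps FIFO order */
	for (i = 0; i < 3; i++) {
		item = kzalloc(sizeof(*item), GFP_KERNEL);
		if (!item)
			break;
		item->value = i;
		list_add_tail(&item->node, &foo_list);
	}

	/* plain traversal; entries must not be removed inside this loop */
	list_for_each_entry(item, &foo_list, node)
		pr_info("foo value=%d\n", item->value);

	/* the _safe variant caches the next entry, so list_del() is fine here */
	list_for_each_entry_safe(item, tmp, &foo_list, node) {
		list_del(&item->node);
		kfree(item);
	}
}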
/*
 * llc_core.c - Minimum needed routines for sap handling and module init/exit
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <net/net_namespace.h>
#include <net/llc.h>

LIST_HEAD(llc_sap_list);
static DEFINE_SPINLOCK(llc_sap_list_lock);

/**
 * llc_sap_alloc - allocates and initializes sap.
 *
 * Allocates and initializes sap.
 */
static struct llc_sap *llc_sap_alloc(void)
{
	struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC);
	int i;

	if (sap) {
		/* sap->laddr.mac - leave as a null, it's filled by bind */
		sap->state = LLC_SAP_STATE_ACTIVE;
		spin_lock_init(&sap->sk_lock);
		for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++)
			INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i);
		refcount_set(&sap->refcnt, 1);
	}
	return sap;
}

static struct llc_sap *__llc_sap_find(unsigned char sap_value)
{
	struct llc_sap *sap;

	list_for_each_entry(sap, &llc_sap_list, node)
		if (sap->laddr.lsap == sap_value)
			goto out;
	sap = NULL;
out:
	return sap;
}

/**
 * llc_sap_find - searches a SAP in station
 * @sap_value: sap to be found
 *
 * Searches for a sap in the sap list of the LLC's station upon the sap ID.
 * If the sap is found it will be refcounted and the user will have to do
 * a llc_sap_put after use.
 * Returns the sap or %NULL if not found.
 */
struct llc_sap *llc_sap_find(unsigned char sap_value)
{
	struct llc_sap *sap;

	rcu_read_lock_bh();
	sap = __llc_sap_find(sap_value);
	if (!sap || !llc_sap_hold_safe(sap))
		sap = NULL;
	rcu_read_unlock_bh();
	return sap;
}

/**
 * llc_sap_open - open interface to the upper layers.
 * @lsap: SAP number.
 * @func: rcv func for datalink protos
 *
 * Interface function to upper layer. Each one who wants to get a SAP
 * (for example NetBEUI) should call this function. Returns the opened
 * SAP for success, NULL for failure.
 */
struct llc_sap *llc_sap_open(unsigned char lsap,
			     int (*func)(struct sk_buff *skb,
					 struct net_device *dev,
					 struct packet_type *pt,
					 struct net_device *orig_dev))
{
	struct llc_sap *sap = NULL;

	spin_lock_bh(&llc_sap_list_lock);
	if (__llc_sap_find(lsap)) /* SAP already exists */
		goto out;
	sap = llc_sap_alloc();
	if (!sap)
		goto out;
	sap->laddr.lsap = lsap;
	sap->rcv_func = func;
	list_add_tail_rcu(&sap->node, &llc_sap_list);
out:
	spin_unlock_bh(&llc_sap_list_lock);
	return sap;
}

/**
 * llc_sap_close - close interface for upper layers.
 * @sap: SAP to be closed.
 *
 * Close interface function to upper layer. Each one who wants to
 * close an open SAP (for example NetBEUI) should call this function.
 * Removes this sap from the list of saps in the station and then
 * frees the memory for this sap.
 */
void llc_sap_close(struct llc_sap *sap)
{
	WARN_ON(sap->sk_count);

	spin_lock_bh(&llc_sap_list_lock);
	list_del_rcu(&sap->node);
	spin_unlock_bh(&llc_sap_list_lock);

	kfree_rcu(sap, rcu);
}

static struct packet_type llc_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_802_2),
	.func = llc_rcv,
};

static int __init llc_init(void)
{
	dev_add_pack(&llc_packet_type);
	return 0;
}

static void __exit llc_exit(void)
{
	dev_remove_pack(&llc_packet_type);
}

module_init(llc_init);
module_exit(llc_exit);

EXPORT_SYMBOL(llc_sap_list);
EXPORT_SYMBOL(llc_sap_find);
EXPORT_SYMBOL(llc_sap_open);
EXPORT_SYMBOL(llc_sap_close);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
MODULE_DESCRIPTION("LLC IEEE 802.2 core support");
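/*
 * Editor's illustrative sketch (not part of the file above): how an upper
 * layer protocol might claim a SAP with llc_sap_open() and release it again
 * with llc_sap_close(). The SAP value 0xAA and all demo_* names are
 * hypothetical; a real client would also keep the returned pointer for its
 * transmit path.
 */
static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
		    struct packet_type *pt, struct net_device *orig_dev)
{
	/* a real handler would parse the LLC PDU before consuming the skb */
	kfree_skb(skb);
	return 0;
}

static struct llc_sap *demo_sap;

static int demo_open(void)
{
	demo_sap = llc_sap_open(0xAA, demo_rcv);
	return demo_sap ? 0 : -EBUSY;	/* NULL means the SAP is already taken */
}

static void demo_close(void)
{
	/* unlinks the SAP from llc_sap_list and frees it after an RCU grace period */
	llc_sap_close(demo_sap);
}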
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define 
_TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? 
mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); 
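/*
 * Editor's note (illustrative, not part of the original header): each
 * TRACE_EVENT(name, ...) definition above generates a trace_name() helper
 * that the writeback code calls at the matching point. For example, the
 * inode switch path would fire the event defined above roughly as:
 *
 *	trace_inode_switch_wbs(inode, old_wb, new_wb);
 *
 * When the tracepoint is not enabled, the call is effectively a no-op.
 */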
TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 
\ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); 
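/*
 * For illustration only (not part of the original header): KBps() above
 * turns a page count into kibibytes per second by shifting left by
 * (PAGE_SHIFT - 10), i.e. multiplying by PAGE_SIZE / 1024. The standalone
 * sketch below assumes 4 KiB pages (PAGE_SHIFT == 12); the names are
 * invented for the example.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT	12	/* assumed: 4 KiB pages */
#define EXAMPLE_KBPS(x)		((x) << (EXAMPLE_PAGE_SHIFT - 10))

int main(void)
{
	/* 100 pages/s -> 100 * 4 KiB = 400 KB/s, matching KBps(100) above. */
	printf("%lu KB/s\n", (unsigned long)EXAMPLE_KBPS(100));
	return 0;
}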
TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), 
TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
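/*
 * A minimal sketch (not part of the header above) of how these events are
 * consumed: every DEFINE_EVENT()/TRACE_EVENT() expansion generates a
 * trace_<name>() function taking the TP_PROTO arguments, which kernel code
 * calls at the point of interest. The caller below is hypothetical; the
 * real call sites live in the writeback code itself.
 */
#include <linux/backing-dev.h>
#include <trace/events/writeback.h>

static void example_fire_wake_background(struct bdi_writeback *wb)
{
	/* Emits the writeback_wake_background event defined above. */
	trace_writeback_wake_background(wb);
}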
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/etherdevice.h>
#include "ipvlan.h"
#include <linux/if_vlan.h>
#include <linux/if_tap.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6)

static dev_t ipvtap_major;
static struct cdev ipvtap_cdev;

static const void *ipvtap_net_namespace(const struct device *d)
{
	const struct net_device *dev = to_net_dev(d->parent);

	return dev_net(dev);
}

static struct class ipvtap_class = {
	.name = "ipvtap",
	.ns_type = &net_ns_type_operations,
	.namespace = ipvtap_net_namespace,
};

struct ipvtap_dev {
	struct ipvl_dev vlan;
	struct tap_dev tap;
};

static void ipvtap_count_tx_dropped(struct tap_dev *tap)
{
	struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap);
	struct ipvl_dev *vlan = &vlantap->vlan;

	this_cpu_inc(vlan->pcpu_stats->tx_drps);
}

static void ipvtap_count_rx_dropped(struct tap_dev *tap)
{
	struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap);
	struct ipvl_dev *vlan = &vlantap->vlan;

	ipvlan_count_rx(vlan, 0, 0, 0);
}

static void ipvtap_update_features(struct tap_dev *tap,
				   netdev_features_t features)
{
	struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap);
	struct ipvl_dev *vlan = &vlantap->vlan;

	vlan->sfeatures = features;
	netdev_update_features(vlan->dev);
}

static int ipvtap_newlink(struct net *src_net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ipvtap_dev *vlantap = netdev_priv(dev);
	int err;

	INIT_LIST_HEAD(&vlantap->tap.queue_list);

	/* Since macvlan supports all offloads by default, make
	 * tap support all offloads also.
	 */
	vlantap->tap.tap_features = TUN_OFFLOADS;
	vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped;
	vlantap->tap.update_features = ipvtap_update_features;
	vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped;

	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
	if (err)
		return err;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
*/ err = ipvlan_link_new(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return err; } static void ipvtap_dellink(struct net_device *dev, struct list_head *head) { struct ipvtap_dev *vlan = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlan->tap); ipvlan_link_delete(dev, head); } static void ipvtap_setup(struct net_device *dev) { ipvlan_link_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; dev->priv_flags &= ~IFF_NO_QUEUE; } static struct rtnl_link_ops ipvtap_link_ops __read_mostly = { .kind = "ipvtap", .setup = ipvtap_setup, .newlink = ipvtap_newlink, .dellink = ipvtap_dellink, .priv_size = sizeof(struct ipvtap_dev), }; static int ipvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &ipvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(ipvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); classdev = device_create(&ipvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(ipvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); device_destroy(&ipvtap_class, devt); tap_free_minor(ipvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block ipvtap_notifier_block __read_mostly = { .notifier_call = ipvtap_device_event, }; static int __init ipvtap_init(void) { int err; err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap", THIS_MODULE); if (err) goto out1; err = class_register(&ipvtap_class); if (err) goto out2; err = register_netdevice_notifier(&ipvtap_notifier_block); if (err) goto out3; err = ipvlan_link_register(&ipvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&ipvtap_notifier_block); out3: class_unregister(&ipvtap_class); out2: tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); out1: return err; } module_init(ipvtap_init); static void __exit ipvtap_exit(void) { rtnl_link_unregister(&ipvtap_link_ops); unregister_netdevice_notifier(&ipvtap_notifier_block); class_unregister(&ipvtap_class); tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); } module_exit(ipvtap_exit); MODULE_ALIAS_RTNL_LINK("ipvtap"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_DESCRIPTION("IP-VLAN based tap driver"); MODULE_LICENSE("GPL");
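/*
 * The tap callbacks above (ipvtap_count_tx_dropped() and friends) recover
 * the outer ipvtap_dev from the embedded tap_dev with container_of(). A
 * self-contained userspace sketch of that embedding pattern follows; the
 * struct and function names are invented for illustration and are not part
 * of the driver.
 */
#include <stddef.h>
#include <stdio.h>

#define example_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct example_inner { int minor; };
struct example_outer {
	int id;
	struct example_inner inner;	/* embedded, like tap in ipvtap_dev */
};

static void example_callback(struct example_inner *inner)
{
	/* Walk back from the embedded member to the enclosing object. */
	struct example_outer *outer =
		example_container_of(inner, struct example_outer, inner);

	printf("outer id = %d\n", outer->id);
}

int main(void)
{
	struct example_outer o = { .id = 42 };

	example_callback(&o.inner);
	return 0;
}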
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *	Cleaned up and restructured to ease the addition of alternative
 *	implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *	Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define _LINUX_SLAB_H

#include <linux/cache.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
#include <linux/hash.h>

enum _slab_flag_bits {
	_SLAB_CONSISTENCY_CHECKS,
	_SLAB_RED_ZONE,
	_SLAB_POISON,
	_SLAB_KMALLOC,
	_SLAB_HWCACHE_ALIGN,
	_SLAB_CACHE_DMA,
	_SLAB_CACHE_DMA32,
	_SLAB_STORE_USER,
	_SLAB_PANIC,
	_SLAB_TYPESAFE_BY_RCU,
	_SLAB_TRACE,
#ifdef CONFIG_DEBUG_OBJECTS
	_SLAB_DEBUG_OBJECTS,
#endif
	_SLAB_NOLEAKTRACE,
	_SLAB_NO_MERGE,
#ifdef CONFIG_FAILSLAB
	_SLAB_FAILSLAB,
#endif
#ifdef CONFIG_MEMCG
	_SLAB_ACCOUNT,
#endif
#ifdef CONFIG_KASAN_GENERIC
	_SLAB_KASAN,
#endif
	_SLAB_NO_USER_FLAGS,
#ifdef CONFIG_KFENCE
	_SLAB_SKIP_KFENCE,
#endif
#ifndef CONFIG_SLUB_TINY
	_SLAB_RECLAIM_ACCOUNT,
#endif
	_SLAB_OBJECT_POISON,
	_SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
	_SLAB_NO_OBJ_EXT,
#endif
	_SLAB_FLAGS_LAST_BIT
};

#define __SLAB_FLAG_BIT(nr)	((slab_flags_t __force)(1U << (nr)))
#define __SLAB_FLAG_UNUSED	((slab_flags_t __force)(0U))

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS	__SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE		__SLAB_FLAG_BIT(_SLAB_RED_ZONE)
/* DEBUG: Poison objects */
#define SLAB_POISON		__SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC		__SLAB_FLAG_BIT(_SLAB_KMALLOC)
/**
 * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * Sufficiently large objects are aligned on cache line boundary. For object
 * size smaller than a half of cache line size, the alignment is on the half of
 * cache line size. In general, if object size is smaller than 1/2^n of cache
 * line size, the alignment is adjusted to 1/2^n.
 *
 * If explicit alignment is also requested by the respective
 * &struct kmem_cache_args field, the greater of the two alignments is applied.
*/ #define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA) /* Use GFP_DMA32 memory */ #define SLAB_CACHE_DMA32 __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32) /* DEBUG: Store the last owner for bug hunting */ #define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER) /* Panic if kmem_cache_create() fails */ #define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) /** * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() * that memory location is free to be reused at any time. Thus it may * be possible to see another object there in the same RCU grace period. * * This feature only ensures the memory location backing the object * stays valid, the trick to using this is relying on an independent * object validation pass. Something like: * * :: * * begin: * rcu_read_lock(); * obj = lockless_lookup(key); * if (obj) { * if (!try_get_ref(obj)) // might fail for free objects * rcu_read_unlock(); * goto begin; * * if (obj->key != key) { // not the object we expected * put_ref(obj); * rcu_read_unlock(); * goto begin; * } * } * rcu_read_unlock(); * * This is useful if we need to approach a kernel structure obliquely, * from its address obtained without the usual locking. We can lock * the structure to stabilize it and check it's still at the given address, * only if we can be sure that the memory has not been meanwhile reused * for some other kind of object (which our subsystem's lock might corrupt). * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages * are not zeroed before being given to the slab, which means that any * locks must be initialized after each and every kmem_struct_alloc(). * Alternatively, make the ctor passed to kmem_cache_create() initialize * the locks at page-allocation time, as is done in __i915_request_ctor(), * sighand_ctor(), and anon_vma_ctor(). Such a ctor permits readers * to safely acquire those ctor-initialized locks under rcu_read_lock() * protection. * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ #define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) /* Trace allocations and frees */ #define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS # define SLAB_DEBUG_OBJECTS __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS) #else # define SLAB_DEBUG_OBJECTS __SLAB_FLAG_UNUSED #endif /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE) /* * Prevent merging with compatible kmem caches. This flag should be used * cautiously. Valid use cases: * * - caches created for self-tests (e.g. 
kunit) * - general caches created and used by a subsystem, only when a * (subsystem-specific) debug option is enabled * - performance critical caches, should be very rare and consulted with slab * maintainers, and not used together with CONFIG_SLUB_TINY */ #define SLAB_NO_MERGE __SLAB_FLAG_BIT(_SLAB_NO_MERGE) /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB __SLAB_FLAG_BIT(_SLAB_FAILSLAB) #else # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /** * define SLAB_ACCOUNT - Account allocations to memcg. * * All object allocations from this cache will be memcg accounted, regardless of * __GFP_ACCOUNT being or not being passed to individual allocations. */ #ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED #endif #ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN __SLAB_FLAG_BIT(_SLAB_KASAN) #else #define SLAB_KASAN __SLAB_FLAG_UNUSED #endif /* * Ignore user specified debugging flags. * Intended for caches created for self-tests so they have only flags * specified in the code and other flags are ignored. */ #define SLAB_NO_USER_FLAGS __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS) #ifdef CONFIG_KFENCE #define SLAB_SKIP_KFENCE __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE) #else #define SLAB_SKIP_KFENCE __SLAB_FLAG_UNUSED #endif /* The following flags affect the page allocator grouping pages by mobility */ /** * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. * * Use this flag for caches that have an associated shrinker. As a result, slab * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by * mobility, and are accounted in SReclaimable counter in /proc/meminfo */ #ifndef CONFIG_SLUB_TINY #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT) #else #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_UNUSED #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ #ifdef CONFIG_SLAB_OBJ_EXT #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) #else #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif /* * freeptr_t represents a SLUB freelist pointer, which might be encoded * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. */ typedef struct { unsigned long v; } freeptr_t; /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. * * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. * Both make kfree a no-op. */ #define ZERO_SIZE_PTR ((void *)16) #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) #include <linux/kasan.h> struct list_lru; struct mem_cgroup; /* * struct kmem_cache related prototypes */ bool slab_is_available(void); /** * struct kmem_cache_args - Less common arguments for kmem_cache_create() * * Any uninitialized fields of the structure are interpreted as unused. The * exception is @freeptr_offset where %0 is a valid value, so * @use_freeptr_offset must be also set to %true in order to interpret the field * as used. For @useroffset %0 is also valid, but only with non-%0 * @usersize. * * When %NULL args is passed to kmem_cache_create(), it is equivalent to all * fields unused. */ struct kmem_cache_args { /** * @align: The required alignment for the objects. * * %0 means no specific alignment is requested. */ unsigned int align; /** * @useroffset: Usercopy region offset. 
* * %0 is a valid offset, when @usersize is non-%0 */ unsigned int useroffset; /** * @usersize: Usercopy region size. * * %0 means no usercopy region is specified. */ unsigned int usersize; /** * @freeptr_offset: Custom offset for the free pointer * in &SLAB_TYPESAFE_BY_RCU caches * * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer * outside of the object. This might cause the object to grow in size. * Cache creators that have a reason to avoid this can specify a custom * free pointer offset in their struct where the free pointer will be * placed. * * Note that placing the free pointer inside the object requires the * caller to ensure that no fields are invalidated that are required to * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for * details). * * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset * is specified, %use_freeptr_offset must be set %true. * * Note that @ctor currently isn't supported with custom free pointers * as a @ctor requires an external free pointer. */ unsigned int freeptr_offset; /** * @use_freeptr_offset: Whether a @freeptr_offset is used. */ bool use_freeptr_offset; /** * @ctor: A constructor for the objects. * * The constructor is invoked for each object in a newly allocated slab * page. It is the cache user's responsibility to free object in the * same state as after calling the constructor, or deal appropriately * with any differences between a freshly constructed and a reallocated * object. * * %NULL means no constructor. */ void (*ctor)(void *); }; struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); static inline struct kmem_cache * __kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, .ctor = ctor, }; return __kmem_cache_create_args(name, size, &kmem_args, flags); } /** * kmem_cache_create_usercopy - Create a kmem cache with a region suitable * for copying to userspace. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset * @usersize: Usercopy region size * @ctor: A constructor for the objects, or %NULL. * * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY() * if whitelisting a single field is sufficient, or kmem_cache_create() with * the necessary parameters passed via the args parameter (see * &struct kmem_cache_args) * * Return: a pointer to the cache on success, NULL on failure. */ static inline struct kmem_cache * kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, .ctor = ctor, .useroffset = useroffset, .usersize = usersize, }; return __kmem_cache_create_args(name, size, &kmem_args, flags); } /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * __kmem_cache_default_args(const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags) { struct kmem_cache_args kmem_default_args = {}; /* Make sure we don't get passed garbage. 
*/ if (WARN_ON_ONCE(args)) return ERR_PTR(-EINVAL); return __kmem_cache_create_args(name, size, &kmem_default_args, flags); } /** * kmem_cache_create - Create a kmem cache. * @__name: A string which is used in /proc/slabinfo to identify this cache. * @__object_size: The size of objects to be created in this cache. * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL * means defaults will be used for all the arguments. * * This is currently implemented as a macro using ``_Generic()`` to call * either the new variant of the function, or a legacy one. * * The new variant has 4 parameters: * ``kmem_cache_create(name, object_size, args, flags)`` * * See __kmem_cache_create_args() which implements this. * * The legacy variant has 5 parameters: * ``kmem_cache_create(name, object_size, align, flags, ctor)`` * * The align and ctor parameters map to the respective fields of * &struct kmem_cache_args * * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. */ #define kmem_cache_create(__name, __object_size, __args, ...) \ _Generic((__args), \ struct kmem_cache_args *: __kmem_cache_create_args, \ void *: __kmem_cache_default_args, \ default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. * * The alignment of the struct determines object alignment. If you * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ #define KMEM_CACHE(__struct, __flags) \ __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ &(struct kmem_cache_args) { \ .align = __alignof__(struct __struct), \ }, (__flags)) /* * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ #define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ &(struct kmem_cache_args) { \ .align = __alignof__(struct __struct), \ .useroffset = offsetof(struct __struct, __field), \ .usersize = sizeof_field(struct __struct, __field), \ }, (__flags)) /* * Common kmalloc functions provided by all allocators */ void * __must_check krealloc_noprof(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) /** * ksize - Report actual allocation size of associated object * * @objp: Pointer returned from a prior kmalloc()-family allocation. * * This should not be used for writing beyond the originally requested * allocation size. Either use krealloc() or round up the allocation size * with kmalloc_size_roundup() prior to allocation. If this is used to * access beyond the originally requested allocation size, UBSAN_BOUNDS * and/or FORTIFY_SOURCE may trip, since they only know about the * originally allocated size via the __alloc_size attribute. 
*/ size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_dump_obj(void *object); #else static inline bool kmem_dump_obj(void *object) { return false; } #endif /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. * Setting ARCH_DMA_MINALIGN in arch headers allows that. */ #ifdef ARCH_HAS_DMA_MINALIGN #if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN) #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #endif #endif #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #elif ARCH_KMALLOC_MINALIGN > 8 #define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) #endif /* * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. * Intended for arches that get misalignment faults even for 64 bit integer * aligned buffers. */ #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif /* * Arches can define this function if they want to decide the minimum slab * alignment at runtime. The value returned by the function must be a power * of two and >= ARCH_SLAB_MINALIGN. */ #ifndef arch_slab_minalign static inline unsigned int arch_slab_minalign(void) { return ARCH_SLAB_MINALIGN; } #endif /* * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN. * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment. */ #define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) #define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) #define __assume_page_alignment __assume_aligned(PAGE_SIZE) /* * Kmalloc array related definitions */ /* * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) /* Maximum size for which we actually use a slab cache */ #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) /* Maximum order allocatable via the slab allocator */ #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* * Kmalloc subsystem. */ #ifndef KMALLOC_MIN_SIZE #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif /* * This restriction comes from byte sized index implementation. * Page size is normally 2^12 bytes and, in this case, if we want to use * byte sized index which can represent 2^8 entries, the size of the object * should be equal or greater to 2^12 / 2^8 = 2^4 = 16. * If minimum size of kmalloc is less than 16, we use it as minimum object * size and give up to use byte sized index. */ #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) #ifdef CONFIG_RANDOM_KMALLOC_CACHES #define RANDOM_KMALLOC_CACHES_NR 15 // # of cache copies #else #define RANDOM_KMALLOC_CACHES_NR 0 #endif /* * Whenever changing this, take care of that kmalloc_type() and * create_kmalloc_caches() still work as intended. * * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP * is for accounted but unreclaimable and non-dma objects. All the other * kmem caches can have both accounted and unaccounted objects. 
*/ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif #ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR, #ifdef CONFIG_SLUB_TINY KMALLOC_RECLAIM = KMALLOC_NORMAL, #else KMALLOC_RECLAIM, #endif #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif #ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES }; typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1]; extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; /* * Define gfp bits that should not be set for KMALLOC_NORMAL. */ #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller) { /* * The most common case is KMALLOC_NORMAL, so test for it * with a single branch for all the relevant flags. */ if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) #ifdef CONFIG_RANDOM_KMALLOC_CACHES /* RANDOM_KMALLOC_CACHES_NR (=15) copies + the KMALLOC_NORMAL */ return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed, ilog2(RANDOM_KMALLOC_CACHES_NR + 1)); #else return KMALLOC_NORMAL; #endif /* * At least one of the flags has to be set. Their priorities in * decreasing order are: * 1) __GFP_DMA * 2) __GFP_RECLAIMABLE * 3) __GFP_ACCOUNT */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; } /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes * 2 = 129 .. 192 bytes * n = 2^(n-1)+1 .. 2^n * * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized; * typical usage is via kmalloc_index() and therefore evaluated at compile-time. * Callers where !size_is_constant should only be test modules, where runtime * overheads of __kmalloc_index() can be tolerated. Also see kmalloc_slab(). */ static __always_inline unsigned int __kmalloc_index(size_t size, bool size_is_constant) { if (!size) return 0; if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) return 1; if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) return 2; if (size <= 8) return 3; if (size <= 16) return 4; if (size <= 32) return 5; if (size <= 64) return 6; if (size <= 128) return 7; if (size <= 256) return 8; if (size <= 512) return 9; if (size <= 1024) return 10; if (size <= 2 * 1024) return 11; if (size <= 4 * 1024) return 12; if (size <= 8 * 1024) return 13; if (size <= 16 * 1024) return 14; if (size <= 32 * 1024) return 15; if (size <= 64 * 1024) return 16; if (size <= 128 * 1024) return 17; if (size <= 256 * 1024) return 18; if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); else BUG(); /* Will never be reached. Needed because the compiler may complain */ return -1; } static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #include <linux/alloc_tag.h> /** * kmem_cache_alloc - Allocate an object * @cachep: The cache to allocate from. * @flags: See kmalloc(). 
* * Allocate an object from this cache. * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags. * * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; #define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; #define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) /** * kmem_cache_charge - memcg charge an already allocated slab memory * @objp: address of the slab object to memcg charge * @gfpflags: describe the allocation context * * kmem_cache_charge allows charging a slab object to the current memcg, * primarily in cases where charging at allocation time might not be possible * because the target memcg is not known (i.e. softirq context) * * The objp should be pointer returned by the slab allocator functions like * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge * behavior can be controlled through gfpflags parameter, which affects how the * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes * that overcharging is requested instead of failure, but is not applied for the * internal metadata allocation. * * There are several cases where it will return true even if the charging was * not done: * More specifically: * * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems. * 2. Already charged slab objects. * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc() * without __GFP_ACCOUNT * 4. Allocating internal metadata has failed * * Return: true if charge was successful otherwise false. */ bool kmem_cache_charge(void *objp, gfp_t gfpflags); void kmem_cache_free(struct kmem_cache *s, void *objp); kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); /* * Bulk allocation and freeing operations. These are accelerated in an * allocator specific way to avoid taking locks repeatedly or building * metadata structures unnecessarily. * * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); #define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) static __always_inline void kfree_bulk(size_t size, void **p) { kmem_cache_free_bulk(NULL, size, p); } void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call * sites don't have to pass NULL. 
*/ #ifdef CONFIG_SLAB_BUCKETS #define DECL_BUCKET_PARAMS(_size, _b) size_t (_size), kmem_buckets *(_b) #define PASS_BUCKET_PARAMS(_size, _b) (_size), (_b) #define PASS_BUCKET_PARAM(_b) (_b) #else #define DECL_BUCKET_PARAMS(_size, _b) size_t (_size) #define PASS_BUCKET_PARAMS(_size, _b) (_size) #define PASS_BUCKET_PARAM(_b) NULL #endif /* * The following functions are not to be used directly and are intended only * for internal use from kmalloc() and kmalloc_node() * with the exception of kunit tests */ void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size) __assume_kmalloc_alignment __alloc_size(3); void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_kmalloc_alignment __alloc_size(4); void *__kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment __alloc_size(1); /** * kmalloc - allocate kernel memory * @size: how many bytes of memory are required. * @flags: describe the allocation context * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN * bytes. For @size of power of two bytes, the alignment is also guaranteed * to be at least to the size. For other sizes, the alignment is guaranteed to * be at least the largest power-of-two divisor of @size. * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp_types.h and described at * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>` * * The recommended usage of the @flags is described at * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>` * * Below is a brief outline of the most useful GFP flags * * %GFP_KERNEL * Allocate normal kernel ram. May sleep. * * %GFP_NOWAIT * Allocation will not sleep. * * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * * %__GFP_ZERO * Zero the allocated memory before returning. Also see kzalloc(). * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL * Indicate that this allocation is in no way allowed to fail * (think twice before using). * * %__GFP_NORETRY * If memory is not immediately available, * then give up at once. * * %__GFP_NOWARN * If allocation fails, don't issue any warnings. * * %__GFP_RETRY_MAYFAIL * Try really hard to succeed the allocation but fail * eventually. */ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return __kmalloc_large_noprof(size, flags); index = kmalloc_index(size); return __kmalloc_cache_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, size); } return __kmalloc_noprof(size, flags); } #define kmalloc(...) 
alloc_hooks(kmalloc_noprof(__VA_ARGS__)) #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) #define kmem_buckets_alloc_track_caller(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE, _RET_IP_)) static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return __kmalloc_large_node_noprof(size, flags, node); index = kmalloc_index(size); return __kmalloc_cache_node_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, node, size); } return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node); } #define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) /** * kmalloc_array - allocate memory for an array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc_noprof(bytes, flags); return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) /** * krealloc_array - reallocate memory for an array. * @p: pointer to the memory chunk to reallocate * @new_n: new number of elements to alloc * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * See krealloc_noprof() for further details. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. */ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, size_t new_n, size_t new_size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) return NULL; return krealloc_noprof(p, bytes, flags); } #define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) /** * kcalloc - allocate memory for an array. The memory is set to zero. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ #define kcalloc(n, size, flags) kmalloc_array(n, size, (flags) | __GFP_ZERO) void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller_noprof(size, flags, node, caller) \ __kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node, caller) #define kmalloc_node_track_caller(...) \ alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead * of just the calling function (confusing, eh?). * It's useful when the call to kmalloc comes from a widely-used standard * allocator where we care about the real place the memory allocation * request comes from. */ #define kmalloc_track_caller(...) 
kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) #define kmalloc_track_caller_noprof(...) \ kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc_node_noprof(bytes, flags, node); return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(bytes, NULL), flags, node); } #define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) #define kcalloc_node(_n, _size, _flags, _node) \ kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) /* * Shortcuts */ #define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) /** * kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) { return kmalloc_noprof(size, flags | __GFP_ZERO); } #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); #define kvmalloc_node_noprof(size, flags, node) \ __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) #define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) #define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) #define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) #define kmem_buckets_valloc(_b, _size, _flags) \ alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return kvmalloc_node_noprof(bytes, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) #define kvcalloc_node_noprof(_n,_s,_f,_node) kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node) #define kvcalloc_noprof(...) kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE) #define kvmalloc_array(...) alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__)) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) __realloc_size(2); #define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); /** * kmalloc_size_roundup - Report allocation bucket size for the given size * * @size: Number of bytes to round up from. * * This returns the number of bytes that would be available in a kmalloc() * allocation of @size bytes. For example, a 126 byte request would be * rounded up to the next sized kmalloc bucket, 128 bytes. 
(This is strictly * for the general-purpose kmalloc()-based allocations, and is not for the * pre-sized kmem_cache_alloc()-based allocations.) * * Use this to kmalloc() the full bucket size ahead of time instead of using * ksize() to query the size after an allocation. */ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); void __init kvfree_rcu_init(void); #endif /* _LINUX_SLAB_H */
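/*
 * A minimal usage sketch of the cache API declared above (hypothetical
 * module code, not part of this header): create a cache with KMEM_CACHE(),
 * allocate zeroed objects with kmem_cache_zalloc(), and tear everything
 * down with kmem_cache_free() and kmem_cache_destroy(). The example_*
 * names are invented for illustration.
 */
#include <linux/errno.h>
#include <linux/slab.h>

struct example_item {
	int id;
	char name[16];
};

static struct kmem_cache *example_cache;

static int example_setup(void)
{
	/* Object alignment is taken from __alignof__(struct example_item). */
	example_cache = KMEM_CACHE(example_item, SLAB_HWCACHE_ALIGN);
	if (!example_cache)
		return -ENOMEM;
	return 0;
}

static struct example_item *example_new(int id)
{
	struct example_item *item = kmem_cache_zalloc(example_cache, GFP_KERNEL);

	if (item)
		item->id = id;
	return item;
}

static void example_teardown(struct example_item *item)
{
	kmem_cache_free(example_cache, item);
	kmem_cache_destroy(example_cache);
}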
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <linux/page_frag_cache.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/netmem.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set.
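 *
 * As a rough illustration (the feature mix is device-specific and the
 * lines below are purely an example, not a requirement), a driver
 * typically advertises these capabilities once, at probe time, on the
 * net_device it registers::
 *
 *	dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
 *	dev->features    |= dev->hw_features;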
* * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. 
* A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! 
*/ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. 
*/ #define GSO_BY_FRAGS 0xFFFF typedef struct skb_frag { netmem_ref netmem; unsigned int len; unsigned int offset; } skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->netmem * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. 
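 *
 * As a rough sketch (the timestamp source below is hypothetical), a
 * driver's receive completion path would typically fill it through
 * skb_hwtstamps()::
 *
 *	struct skb_shared_hwtstamps *hwts = skb_hwtstamps(skb);
 *
 *	hwts->hwtstamp = ns_to_ktime(hw_rx_timestamp_ns);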
*/ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate hardware time stamp based on cycles if supported */ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) struct ubuf_info_ops { void (*complete)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); /* has to be compatible with skb_zcopy_set() */ int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg); }; /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { const struct ubuf_info_ops *ops; refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* Preserve some data across TX submission and completion. * * Note, this state is stored in the driver. Extending the layout * might need some special care. */ struct xsk_tx_metadata_compl { __u64 *tx_timestamp; }; /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! 
*/ unsigned short gso_segs; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; union { struct { u32 xdp_frags_size; u32 xdp_frags_truesize; }; /* * Intermediate layers must ensure that destructor_arg * remains valid until skb destructor. */ void *destructor_arg; }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif enum skb_tstamp_type { SKB_CLOCK_REALTIME, SKB_CLOCK_MONOTONIC, SKB_CLOCK_TAI, __SKB_CLOCK_MAX = SKB_CLOCK_TAI, }; /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. 
* * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. 
* @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @unreadable: indicates that at least 1 of the fragments in this skb is * unreadable. * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @tstamp_type: When set, skb->tstamp has the * delivery_time clock base of skb->tstamp. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; struct sock *sk; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. 
*/ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 tstamp_type:2; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_SKB_DECRYPTED __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif __u8 unreadable:1; #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or tstamp_type * around, you also must adapt these constants. 
*/ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_TSTAMP_TYPE_MASK (3 << 6) #define SKB_TSTAMP_TYPE_RSHIFT (6) #define TC_AT_INGRESS_MASK (1 << 5) #else #define SKB_TSTAMP_TYPE_MASK (3) #define TC_AT_INGRESS_MASK (1 << 2) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } static inline bool skb_data_unref(const struct sk_buff *skb, struct skb_shared_info *shinfo) { int bias; if (!skb->cloned) return true; bias = skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1; if (atomic_read(&shinfo->dataref) == bias) smp_rmb(); else if (atomic_sub_return(bias, &shinfo->dataref)) return false; return true; } void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason); static inline void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { sk_skb_reason_drop(NULL, skb, reason); } /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). 
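 *
 * Illustrative sketch (not part of this header): a transmit path that
 * knows the buffer will be cloned later, e.g. for retransmission, can
 * allocate from the fclone cache so that the subsequent skb_clone() is
 * served from the companion slot instead of a fresh allocation:
 *
 *      skb = alloc_skb_fclone(size, GFP_KERNEL);
 *      if (!skb)
 *              return -ENOBUFS;
 *      clone = skb_clone(skb, GFP_ATOMIC);
 *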
*/ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. */ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. 
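 *
 * As an illustration (a sketch only, the descriptor field names are made
 * up), a driver whose hardware computed a 4-tuple hash would report it
 * with skb_set_hash() below as:
 *
 *      skb_set_hash(skb, le32_to_cpu(rx_desc->rss_hash), PKT_HASH_TYPE_L4);
 *
 * while a hash covering only the addresses would use PKT_HASH_TYPE_L3.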
* * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. */ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb); static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb) { return __skb_get_hash_symmetric_net(NULL, skb); } void __skb_get_hash_net(const struct net *net, struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) { return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum 
ip_conntrack_info states * to user states. */ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash_net(net, skb); return skb->hash; } static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash_net(NULL, skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_SKB_DECRYPTED return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline bool skb_is_decrypted(const struct sk_buff *skb) { #ifdef CONFIG_SKB_DECRYPTED return skb->decrypted; #else return false; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_SKB_DECRYPTED to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return &skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = 
skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->ops->complete(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->ops == &msg_zerocopy_ubuf_ops) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->ops->complete(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } /* Return true if frags in this skb are readable by the host. */ static inline bool skb_frags_readable(const struct sk_buff *skb) { return !skb->unreadable; } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. * This variant can be used in lockless contexts. 
*/ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. 
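 *
 * A common pattern before writing to the headers is a sketch like the
 * following (this is essentially what skb_header_unclone() below does,
 * with an arbitrary header write standing in at the end):
 *
 *      if (skb_header_cloned(skb) &&
 *          pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 *              goto drop;
 *      ip_hdr(skb)->ttl--;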
*/ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. 
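 *
 * A minimal sketch of a safe peek, with the caller owning the queue lock
 * (declarations and error handling omitted):
 *
 *      spin_lock_irqsave(&list->lock, flags);
 *      skb = skb_peek(list);
 *      if (skb)
 *              len = skb->len;
 *      spin_unlock_irqrestore(&list->lock, flags);
 *
 * Any use of the peeked buffer after the unlock is racy unless a
 * reference was taken first.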
*/ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. 
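 *
 * A short sketch of the intended split: when the queue cannot be touched
 * by any other context (for example an on-stack list being
 * batch-processed), the unlocked variants are enough:
 *
 *      struct sk_buff_head tmp;
 *
 *      __skb_queue_head_init(&tmp);
 *      __skb_queue_tail(&tmp, skb);
 *      while ((skb = __skb_dequeue(&tmp)) != NULL)
 *              consume_skb(skb);
 *
 * A queue shared between contexts must use the locked wrappers such as
 * skb_queue_tail() and skb_dequeue() instead.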
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
        WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev, struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 * skb_queue_splice - join two skb lists, this is designed for stacks
 * @list: the new list to add
 * @head: the place to add it in the first list
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (!skb_queue_empty(list)) {
                __skb_queue_splice(list, (struct sk_buff *) head, head->next);
                head->qlen += list->qlen;
        }
}

/**
 * skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * The list at @list is reinitialised
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (!skb_queue_empty(list)) {
                __skb_queue_splice(list, (struct sk_buff *) head, head->next);
                head->qlen += list->qlen;
                __skb_queue_head_init(list);
        }
}

/**
 * skb_queue_splice_tail - join two skb lists, each list being a queue
 * @list: the new list to add
 * @head: the place to add it in the first list
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (!skb_queue_empty(list)) {
                __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
                head->qlen += list->qlen;
        }
}

/**
 * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * Each of the lists is a queue.
 * The list at @list is reinitialised
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (!skb_queue_empty(list)) {
                __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
                head->qlen += list->qlen;
                __skb_queue_head_init(list);
        }
}

/**
 * __skb_queue_after - queue a buffer after a given buffer in a list
 * @list: list to use
 * @prev: place after this buffer
 * @newsk: buffer to queue
 *
 * Queue a buffer in the middle of a list. This function takes no locks
 * and you must therefore hold required locks before calling it.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list);
}

void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list);
}

/**
 * __skb_queue_head - queue a buffer at the list head
 * @list: list to use
 * @newsk: buffer to queue
 *
 * Queue a buffer at the start of a list. This function takes no locks
 * and you must therefore hold required locks before calling it.
 *
 * A buffer cannot be placed on two lists at the same time.
*/ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. 
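 *
 * Sketch of a typical use, dropping the newest buffer of an over-long
 * queue while the caller already holds the queue lock (the threshold
 * name is made up):
 *
 *      if (skb_queue_len(list) > limit) {
 *              struct sk_buff *drop = __skb_dequeue_tail(list);
 *
 *              kfree_skb(drop);
 *      }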
*/ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); return len; } static inline unsigned int skb_pagelen(const struct sk_buff *skb) { return skb_headlen(skb) + __skb_pagelen(skb); } static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag, netmem_ref netmem, int off, int size) { frag->netmem = netmem; frag->offset = off; skb_frag_size_set(frag, size); } static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size); } static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo, int i, netmem_ref netmem, int off, int size) { skb_frag_t *frag = &shinfo->frags[i]; skb_frag_fill_netmem_desc(frag, netmem, off, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off, size); } /** * skb_len_add - adds a number to len fields of skb * @skb: buffer to add len to * @delta: number of bytes to add */ static inline void skb_len_add(struct sk_buff *skb, int delta) { skb->len += delta; skb->data_len += delta; skb->truesize += delta; } /** * __skb_fill_netmem_desc - initialise a fragment in an skb * @skb: buffer containing fragment to be initialised * @i: fragment index to initialise * @netmem: the netmem to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Initialises the @i'th fragment of @skb to point to &size bytes at * offset @off within @page. * * Does not take any additional reference on the fragment. */ static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size) { struct page *page; __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); if (netmem_is_net_iov(netmem)) { skb->unreadable = true; return; } page = netmem_to_page(netmem); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). 
*/ page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size) { __skb_fill_netmem_desc(skb, i, netmem, off, size); skb_shinfo(skb)->nr_frags = i + 1; } /** * skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. */ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } /** * skb_fill_page_desc_noacc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Variant of skb_fill_page_desc() which does not deal with * pfmemalloc, if page is not owned by us. */ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, struct page *page, int off, int size) { struct skb_shared_info *shinfo = skb_shinfo(skb); __skb_fill_page_desc_noacc(shinfo, i, page, off, size); shinfo->nr_frags = i + 1; } void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize); static inline void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size, truesize); } void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->head + skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data - skb->head; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb_reset_tail_pointer(skb); skb->tail += offset; } #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb->tail = skb->data + offset; } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ static inline void skb_assert_len(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET if (WARN_ONCE(!skb->len, "%s\n", __func__)) DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); #endif /* CONFIG_DEBUG_NET */ } #if defined(CONFIG_FAIL_SKB_REALLOC) void skb_might_realloc(struct sk_buff *skb); #else static inline void skb_might_realloc(struct sk_buff *skb) {} #endif /* * Add data to an sk_buff */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); void *skb_put(struct sk_buff *skb, unsigned int len); static inline void 
*__skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; return tmp; } static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = __skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *__skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = __skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void __skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)__skb_put(skb, 1) = val; } static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)skb_put(skb, 1) = val; } void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb->data -= len; skb->len += len; return skb->data; } void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) skb->len += len; pr_err("__skb_pull(len=%u)\n", len); skb_dump(KERN_ERR, skb, false); #endif BUG(); } return skb->data += len; } static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) { return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (unlikely(!__pskb_pull_tail(skb, len - skb_headlen(skb)))) return SKB_DROP_REASON_NOMEM; return SKB_NOT_DROPPED_YET; } static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; skb->len -= len; return skb->data += len; } void skb_condense(struct sk_buff *skb); /** * skb_headroom - bytes at buffer head * @skb: buffer to check * * Return the number of bytes of free space at the head of an &sk_buff. */ static inline unsigned int skb_headroom(const struct sk_buff *skb) { return skb->data - skb->head; } /** * skb_tailroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff */ static inline int skb_tailroom(const struct sk_buff *skb) { return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail; } /** * skb_availroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff * allocated by sk_stream_alloc() */ static inline int skb_availroom(const struct sk_buff *skb) { if (skb_is_nonlinear(skb)) return 0; return skb->end - skb->tail - skb->reserved_tailroom; } /** * skb_reserve - adjust headroom * @skb: buffer to alter * @len: bytes to move * * Increase the headroom of an empty &sk_buff by reducing the tail * room. 
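 *
 * A minimal sketch of the usual build-up of an outgoing buffer (the
 * sizes and the payload pointer are made up):
 *
 *      skb = alloc_skb(hdr_len + data_len, GFP_ATOMIC);
 *      if (!skb)
 *              return NULL;
 *      skb_reserve(skb, hdr_len);
 *      skb_put_data(skb, payload, data_len);
 *      skb_push(skb, hdr_len);
 *
 * Here skb_push() reclaims the reserved headroom for the protocol
 * headers without a reallocation. The skb_reserve() step has to happen
 * before any data is added.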
This is only allowed for an empty buffer. */ static inline void skb_reserve(struct sk_buff *skb, int len) { skb->data += len; skb->tail += len; } /** * skb_tailroom_reserve - adjust reserved_tailroom * @skb: buffer to alter * @mtu: maximum amount of headlen permitted * @needed_tailroom: minimum amount of reserved_tailroom * * Set reserved_tailroom so that headlen can be as large as possible but * not larger than mtu and tailroom cannot be smaller than * needed_tailroom. * The required headroom should already have been reserved before using * this function. */ static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom) { SKB_LINEAR_ASSERT(skb); if (mtu < skb_tailroom(skb) - needed_tailroom) /* use at most mtu */ skb->reserved_tailroom = skb_tailroom(skb) - mtu; else /* use up to all available space */ skb->reserved_tailroom = needed_tailroom; } #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 protocol) { skb->inner_protocol = protocol; skb->inner_protocol_type = ENCAP_TYPE_ETHER; } static inline void skb_set_inner_ipproto(struct sk_buff *skb, __u8 ipproto) { skb->inner_ipproto = ipproto; skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; } static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } static inline int skb_mac_header_was_set(const struct sk_buff *skb) { return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_len(struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { DEBUG_NET_WARN_ON_ONCE(1); skb->mac_len = 0; } else { skb->mac_len = skb->network_header - skb->mac_header; } } static inline unsigned char *skb_inner_transport_header(const struct sk_buff *skb) { return skb->head + skb->inner_transport_header; } static inline int skb_inner_transport_offset(const struct sk_buff *skb) { return skb_inner_transport_header(skb) - skb->data; } static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset); skb->inner_transport_header = offset; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, const int offset) { skb_reset_inner_transport_header(skb); skb->inner_transport_header += offset; } static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + skb->inner_network_header; } static inline void skb_reset_inner_network_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset); skb->inner_network_header = offset; } static inline void skb_set_inner_network_header(struct sk_buff *skb, const int offset) { skb_reset_inner_network_header(skb); skb->inner_network_header += offset; } static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb) { return skb->inner_network_header > 0; } static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; } static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset); skb->inner_mac_header = offset; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, const int offset) { 
skb_reset_inner_mac_header(skb); skb->inner_mac_header += offset; } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } static inline void skb_reset_transport_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset); skb->transport_header = offset; } static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { skb_reset_transport_header(skb); skb->transport_header += offset; } static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; } static inline void skb_reset_network_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset); skb->network_header = offset; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) { skb_reset_network_header(skb); skb->network_header += offset; } static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->head + skb->mac_header; } static inline int skb_mac_offset(const struct sk_buff *skb) { return skb_mac_header(skb) - skb->data; } static inline u32 skb_mac_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->network_header - skb->mac_header; } static inline void skb_unset_mac_header(struct sk_buff *skb) { skb->mac_header = (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset); skb->mac_header = offset; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) { skb_reset_mac_header(skb); skb->mac_header += offset; } static inline void skb_pop_mac_header(struct sk_buff *skb) { skb->mac_header = skb->network_header; } static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); } } /* Move the full mac header up to current network_header. * Leaves skb->data pointing at offset skb->mac_len into the mac_header. * Must be provided the complete mac header length. 
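 *
 * (The offsets moved here are the same ones the reset/set helpers above
 * maintain; a receive path typically records them while parsing, e.g. in
 * a sketch assuming skb->data sits at an IPv4 header with no options:
 *
 *      skb_reset_network_header(skb);
 *      skb_set_transport_header(skb, sizeof(struct iphdr));
 *
 * after which skb_network_header() and skb_transport_header() resolve
 * against skb->head regardless of later pulls.)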
*/ static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -full_mac_len); memmove(skb_mac_header(skb), old_mac, full_mac_len); __skb_push(skb, full_mac_len - skb->mac_len); } } static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); } static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) { return skb->head + skb->csum_start; } static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; } static inline u32 skb_network_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->transport_header - skb->network_header; } static inline u32 skb_inner_network_header_len(const struct sk_buff *skb) { return skb->inner_transport_header - skb->inner_network_header; } static inline int skb_network_offset(const struct sk_buff *skb) { return skb_network_header(skb) - skb->data; } static inline int skb_inner_network_offset(const struct sk_buff *skb) { return skb_inner_network_header(skb) - skb->data; } static inline enum skb_drop_reason pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, skb_network_offset(skb) + len); } static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_network_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the * hardware handles it or large if we have to take an exception and fix it * in software. * * Since an ethernet header is 14 bytes network drivers often end up with * the IP header at an unaligned offset. The IP header can be aligned by * shifting the start of the packet by 2 bytes. Drivers should do this * with: * * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ #ifndef NET_IP_ALIGN #define NET_IP_ALIGN 2 #endif /* * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive * on some architectures. An architecture can override this value, * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. 
* get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD #define NET_SKB_PAD max(32, L1_CACHE_BYTES) #endif int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (WARN_ON(skb_is_nonlinear(skb))) return; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { __skb_set_length(skb, len); } void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { if (skb->data_len) return ___pskb_trim(skb, len); __skb_trim(skb, len); return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } /** * pskb_trim_unique - remove end from a paged unique (not cloned) buffer * @skb: buffer to alter * @len: new length * * This is identical to pskb_trim except that the caller knows that * the skb is not cloned so we should never get an error due to out- * of-memory. */ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) { int err = pskb_trim(skb, len); BUG_ON(err); } static inline int __skb_grow(struct sk_buff *skb, unsigned int len) { unsigned int diff = len - skb->len; if (skb_tailroom(skb) < diff) { int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), GFP_ATOMIC); if (ret) return ret; } __skb_set_length(skb, len); return 0; } /** * skb_orphan - orphan a buffer * @skb: buffer to orphan * * If a buffer currently has an owner then we call the owner's * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ static inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; } else { BUG_ON(skb->sk); } } /** * skb_orphan_frags - orphan the frags contained in a buffer * @skb: buffer to orphan frags from * @gfp_mask: allocation mask for replacement pages * * For each frag in the SKB which needs a destructor (i.e. has an * owner) create a copy of that frag and release the original * page by calling the destructor. */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } /* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } /** * __skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. 
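 *
 * Sketch of a teardown path that already owns the queue lock:
 *
 *      spin_lock_bh(&list->lock);
 *      __skb_queue_purge(list);
 *      spin_unlock_bh(&list->lock);
 *
 * Callers without the lock should use skb_queue_purge() below instead.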
*/ static inline void __skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb_reason(skb, reason); } static inline void __skb_queue_purge(struct sk_buff_head *list) { __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason); static inline void skb_queue_purge(struct sk_buff_head *list) { skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size * * Allocates a frag from a page for receive buffer. * Uses GFP_ATOMIC allocations. */ static inline void *netdev_alloc_frag(unsigned int fragsz) { return __netdev_alloc_frag_align(fragsz, ~0u); } static inline void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __netdev_alloc_frag_align(fragsz, -align); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); /** * netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } /* legacy helper around __netdev_alloc_skb() */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { return __netdev_alloc_skb(NULL, length, gfp_mask); } /* legacy helper around netdev_alloc_skb() */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) { return netdev_alloc_skb(NULL, length); } static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length, gfp_t gfp) { struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); if (NET_IP_ALIGN && skb) skb_reserve(skb, NET_IP_ALIGN); return skb; } static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } static inline void skb_free_frag(void *addr) { page_frag_free(addr); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); static inline void *napi_alloc_frag(unsigned int fragsz) { return __napi_alloc_frag_align(fragsz, ~0u); } static inline void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __napi_alloc_frag_align(fragsz, -align); } struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length); void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx * @gfp_mask: allocation priority. 
Set __GFP_NOMEMALLOC if not for network Rx * @order: size of the allocation * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order); } #define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). */ #define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** * __dev_alloc_page - allocate a page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) { return __dev_alloc_pages_noprof(gfp_mask, 0); } #define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). */ #define dev_alloc_page() dev_alloc_pages(0) /** * dev_page_is_reusable - check whether a page can be reused for network Rx * @page: the page to test * * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * * Returns: false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) { return likely(page_to_nid(page) == numa_mem_id() && !page_is_pfmemalloc(page)); } /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ static inline void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_frag_off() - Returns the offset of a skb fragment * @frag: the paged fragment */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { return frag->offset; } /** * skb_frag_off_add() - Increments the offset of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { frag->offset += delta; } /** * skb_frag_off_set() - Sets the offset of a skb fragment * @frag: skb fragment * @offset: offset of fragment */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { frag->offset = offset; } /** * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment * @fragto: skb fragment where offset is set * @fragfrom: skb fragment offset is copied from */ static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->offset = fragfrom->offset; } /* Return: true if the skb_frag contains a net_iov. 
*/ static inline bool skb_frag_is_net_iov(const skb_frag_t *frag) { return netmem_is_net_iov(frag->netmem); } /** * skb_frag_net_iov - retrieve the net_iov referred to by fragment * @frag: the fragment * * Return: the &struct net_iov associated with @frag. Returns NULL if this * frag has no associated net_iov. */ static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag) { if (!skb_frag_is_net_iov(frag)) return NULL; return netmem_to_net_iov(frag->netmem); } /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Return: the &struct page associated with @frag. Returns NULL if this frag * has no associated page. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { if (skb_frag_is_net_iov(frag)) return NULL; return netmem_to_page(frag->netmem); } /** * skb_frag_netmem - retrieve the netmem referred to by a fragment * @frag: the fragment * * Return: the &netmem_ref associated with @frag. */ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) { return frag->netmem; } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) { if (!skb_frag_page(frag)) return NULL; return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) { void *ptr = page_address(skb_frag_page(frag)); if (unlikely(!ptr)) return NULL; return ptr + skb_frag_off(frag); } /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from */ static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->netmem = fragfrom->netmem; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the * fragment's own offset) * @size: the number of bytes to map * @dir: the direction of the mapping (``PCI_DMA_*``) * * Maps the page associated with @frag to @device. */ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } #define skb_frag_dma_map(dev, frag, ...) 
\ CONCATENATE(_skb_frag_dma_map, \ COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) #define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ const skb_frag_t *uf = (frag); \ size_t uo = (offset); \ \ __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ DMA_TO_DEVICE); \ }) #define _skb_frag_dma_map1(dev, frag, offset) \ __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ __UNIQUE_ID(offset_)) #define _skb_frag_dma_map0(dev, frag) \ _skb_frag_dma_map1(dev, frag, 0) #define _skb_frag_dma_map2(dev, frag, offset, size) \ __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) #define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ __skb_frag_dma_map(dev, frag, offset, size, dir) static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); } /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check * @len: length up to which to write * * Returns true if modifying the header part of the cloned buffer * does not requires the data to be copied. */ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len) { return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len; } static inline int skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { int delta = 0; if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, GFP_ATOMIC); return 0; } /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow * @headroom: needed headroom * * If the skb passed lacks sufficient headroom or its data part * is shared, data is reallocated. If reallocation fails, an error * is returned and original skb is not changed. * * The result is skb with writable area skb->head...skb->tail * and at least @headroom of space at head. */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_cloned(skb)); } /** * skb_cow_head - skb_cow but only making the head writable * @skb: buffer to cow * @headroom: needed headroom * * This function is identical to skb_cow except that we replace the * skb_cloned check by skb_header_cloned. It should be used when * you only need to push on some header and do not need to modify * the data. */ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** * skb_padto - pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. 
*/ static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) return 0; return skb_pad(skb, len - size); } /** * __skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ static inline int __must_check __skb_put_padto(struct sk_buff *skb, unsigned int len, bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } /** * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) __must_check; static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy, &csum, from)) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } } else if (copy_from_iter_full(skb_put(skb, copy), copy, from)) return 0; __skb_trim(skb, off); return -EFAULT; } static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { if (skb_zcopy(skb)) return false; if (i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && off == skb_frag_off(frag) + skb_frag_size(frag); } return false; } static inline int __skb_linearize(struct sk_buff *skb) { return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; } /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize(struct sk_buff *skb) { return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; } /** * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * * Return: true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) { return skb_is_nonlinear(skb) && skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } /** * skb_linearize_cow - make sure skb is linear and writable * @skb: buffer to process * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize_cow(struct sk_buff *skb) { return skb_is_nonlinear(skb) || skb_cloned(skb) ? 
__skb_linearize(skb) : 0; } static __always_inline void __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_sub(skb->csum, csum_partial(start, len, 0), off); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } /** * skb_postpull_rcsum - update checksum for received skb after pull * @skb: buffer to update * @start: start of data before pull * @len: length of data pulled * * After doing a pull on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = wsum_negate(csum_partial(start, len, wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } static __always_inline void __skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_add(skb->csum, csum_partial(start, len, 0), off); } /** * skb_postpush_rcsum - update checksum for received skb after push * @skb: buffer to update * @start: start of data after push * @len: length of data pushed * * After doing a push on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum. */ static inline void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { __skb_postpush_rcsum(skb, start, len, 0); } void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); /** * skb_push_rcsum - push skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_push on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_push unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) { skb_push(skb, len); skb_postpush_rcsum(skb, skb->data, len); return skb->data; } int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim * @len: new length * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. * It can change skb pointers. 
*/ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; __skb_trim(skb, len); return 0; } static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; return __skb_grow(skb, len); } #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define skb_rb_first(root) rb_to_skb(rb_first(root)) #define skb_rb_last(root) rb_to_skb(rb_last(root)) #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_queue_walk_safe(queue, skb, tmp) \ for (skb = (queue)->next, tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_walk_from(queue, skb) \ for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_rbtree_walk(skb, root) \ for (skb = skb_rb_first(root); skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from(skb) \ for (; skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from_safe(skb, tmp) \ for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ skb = tmp) #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ skb != (struct sk_buff *)(queue); \ skb = skb->prev) #define skb_queue_reverse_walk_safe(queue, skb, tmp) \ for (skb = (queue)->prev, tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) #define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { return skb_shinfo(skb)->frag_list != NULL; } static inline void skb_frag_list_init(struct sk_buff *skb) { skb_shinfo(skb)->frag_list = NULL; } #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); int skb_copy_and_hash_datagram_iter(const struct 
sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_eth_pop(struct sk_buff *skb); int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { return copy_to_iter(data, len, &msg->msg_iter) == len ? 
0 : -EFAULT; } struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); }; extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; } static inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); } static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { if (likely(skb_headlen(skb) - offset >= len)) return skb->data + offset; return NULL; } /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. * @skb: socket buffer to check * @features: net device features * * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or * 2. skb is fragmented and the device does not support SG. */ static inline bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG))); } static inline void skb_copy_from_linear_data(const struct sk_buff *skb, void *to, const unsigned int len) { memcpy(to, skb->data, len); } static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, const int offset, void *to, const unsigned int len) { memcpy(to, skb->data + offset, len); } static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) { memcpy(skb->data, from, len); } static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, const int offset, const void *from, const unsigned int len) { memcpy(skb->data + offset, from, len); } void skb_init(void); static inline ktime_t skb_get_ktime(const struct sk_buff *skb) { return skb->tstamp; } /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. 
*/ static inline void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp) { *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_new_timestamp(const struct sk_buff *skb, struct __kernel_sock_timeval *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_usec = ts.tv_nsec / 1000; } static inline void skb_get_timestampns(const struct sk_buff *skb, struct __kernel_old_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void skb_get_new_timestampns(const struct sk_buff *skb, struct __kernel_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline ktime_t net_timedelta(ktime_t t) { return ktime_sub(ktime_get_real(), t); } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, u8 tstamp_type) { skb->tstamp = kt; if (kt) skb->tstamp_type = tstamp_type; else skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, ktime_t kt, clockid_t clockid) { u8 tstamp_type = SKB_CLOCK_REALTIME; switch (clockid) { case CLOCK_REALTIME: break; case CLOCK_MONOTONIC: tstamp_type = SKB_CLOCK_MONOTONIC; break; case CLOCK_TAI: tstamp_type = SKB_CLOCK_TAI; break; default: WARN_ON_ONCE(1); kt = 0; } skb_set_delivery_time(skb, kt, tstamp_type); } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->tstamp_type) { skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else skb->tstamp = 0; } } static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->tstamp_type) return; skb->tstamp = 0; } static inline ktime_t skb_tstamp(const struct sk_buff *skb) { if (skb->tstamp_type) return 0; return skb->tstamp; } static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) return ktime_get_real(); return 0; } static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } static inline void *skb_metadata_end(const struct sk_buff *skb) { return skb_mac_header(skb); } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b, u8 meta_len) { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); u64 diffs = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || BITS_PER_LONG != 64) goto slow; /* Using more efficient variant than plain call to memcmp(). 
*/ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) case 32: diffs |= __it_diff(a, b, 64); fallthrough; case 24: diffs |= __it_diff(a, b, 64); fallthrough; case 16: diffs |= __it_diff(a, b, 64); fallthrough; case 8: diffs |= __it_diff(a, b, 64); break; case 28: diffs |= __it_diff(a, b, 64); fallthrough; case 20: diffs |= __it_diff(a, b, 64); fallthrough; case 12: diffs |= __it_diff(a, b, 64); fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; default: slow: return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { u8 len_a = skb_metadata_len(skb_a); u8 len_b = skb_metadata_len(skb_b); if (!(len_a | len_b)) return false; return len_a != len_b ? true : __skb_metadata_differs(skb_a, skb_b, len_a); } static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; } static inline void skb_metadata_clear(struct sk_buff *skb) { skb_metadata_set(skb, 0); } struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); bool skb_defer_rx_timestamp(struct sk_buff *skb); #else /* CONFIG_NETWORK_PHY_TIMESTAMPING */ static inline void skb_clone_tx_timestamp(struct sk_buff *skb) { } static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) { return false; } #endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */ /** * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers * must call this function to return the skb back to the stack with a * timestamp. * * @skb: clone of the original outgoing packet * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet * @hwtstamps: hardware time stamps, may be NULL if not available * * If the skb has a socket associated, then this function clones the * skb (thus sharing the actual data and optional structures), stores * the optional hardware time stamping information (if non NULL) or * generates a software time stamp (otherwise), then queues the clone * to the error queue of the socket. Errors are silently ignored. */ void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); /** * skb_tx_timestamp() - Driver hook for transmit timestamping * * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * * Specifically, one should make absolutely sure that this function is * called before TX completion of this packet can trigger. Otherwise * the packet could potentially already be freed. * * @skb: A socket buffer. 
*/ static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) skb_tstamp_tx(skb, NULL); } /** * skb_complete_wifi_ack - deliver skb with wifi status * * @skb: the original outgoing packet * @acked: ack status * */ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) >= 0)); } /** * skb_checksum_complete - Calculate checksum of an entire packet * @skb: packet to process * * This function calculates the checksum over the entire packet plus * the value of skb->csum. The latter can be used to supply the * checksum of a pseudo header as used by TCP/UDP. It returns the * checksum. * * For protocols that contain complete checksums such as ICMP/TCP/UDP, * this function can be used to verify that checksum on received * packets. In that case the function should return zero if the * checksum is correct. In particular, this function will return zero * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the * hardware has already verified the correctness of the checksum. */ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) { return skb_csum_unnecessary(skb) ? 0 : __skb_checksum_complete(skb); } static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level == 0) skb->ip_summed = CHECKSUM_NONE; else skb->csum_level--; } } static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level < SKB_MAX_CSUM_LEVEL) skb->csum_level++; } else if (skb->ip_summed == CHECKSUM_NONE) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = 0; } } static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { skb->ip_summed = CHECKSUM_NONE; skb->csum_level = 0; } } /* Check if we need to perform checksum complete validation. * * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; __skb_decr_checksum_unnecessary(skb); return false; } return true; } /* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 /* Unset checksum-complete * * Unset checksum complete can be done when packet is being modified * (uncompressed for instance) and checksum-complete value is * invalidated. */ static inline void skb_checksum_complete_unset(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /* Validate (init) checksum based on checksum complete. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete. 
In the latter * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo * checksum is stored in skb->csum for use in __skb_checksum_complete * non-zero: value of invalid checksum * */ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, bool complete, __wsum psum) { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { skb->csum_valid = 1; return 0; } } skb->csum = psum; if (complete || skb->len <= CHECKSUM_BREAK) { __sum16 csum; csum = __skb_checksum_complete(skb); skb->csum_valid = !csum; return csum; } return 0; } static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) { return 0; } /* Perform checksum validate (init). Note that this is a macro since we only * want to calculate the pseudo header which is an input function if necessary. * First we try to validate without any computation (checksum unnecessary) and * then calculate based on checksum complete calling the function to compute * pseudo header. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete * non-zero: value of invalid checksum */ #define __skb_checksum_validate(skb, proto, complete, \ zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ __ret; \ }) #define skb_checksum_init(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) #define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) #define skb_checksum_validate(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo) { skb->csum = ~pseudo; skb->ip_summed = CHECKSUM_COMPLETE; } #define skb_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_checksum_convert_check(skb)) \ __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \ } while (0) static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, u16 start, u16 offset) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = ((unsigned char *)ptr + start) - skb->head; skb->csum_offset = offset - start; } /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. 
*/ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, bool nopartial) { __wsum delta; if (!nopartial) { skb_remcsum_adjust_partial(skb, ptr, start, offset); return; } if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } delta = remcsum_adjust(ptr, skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); } static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return (void *)(skb->_nfct & NFCT_PTRMASK); #else return NULL; #endif } static inline unsigned long skb_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return skb->_nfct; #else return 0UL; #endif } static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } #ifdef CONFIG_SKB_EXTENSIONS enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; /** * struct skb_ext - sk_buff extensions * @refcnt: 1 on allocation, deallocated on 0 * @offset: offset to add to @data to obtain extension address * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units * @data: start of extension data, variable sized * * Note: offsets/lengths are stored in chunks of 8 bytes, this allows * to use 'u8' types while allowing up to 2kb worth of extension data. */ struct skb_ext { refcount_t refcnt; u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ u8 chunks; /* same */ char data[] __aligned(8); }; struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); static inline void skb_ext_put(struct sk_buff *skb) { if (skb->active_extensions) __skb_ext_put(skb->extensions); } static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { dst->active_extensions = src->active_extensions; if (src->active_extensions) { struct skb_ext *ext = src->extensions; refcount_inc(&ext->refcnt); dst->extensions = ext; } } static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { skb_ext_put(dst); __skb_ext_copy(dst, src); } static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) { return !!ext->offset[i]; } static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { return skb->active_extensions & (1 << id); } static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) __skb_ext_del(skb, id); } static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) { struct skb_ext *ext = skb->extensions; return (void *)ext + (ext->offset[id] << 3); } return NULL; } static inline void skb_ext_reset(struct sk_buff *skb) { if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions); skb->active_extensions = 0; } } static inline bool skb_has_extensions(struct sk_buff *skb) { return unlikely(skb->active_extensions); } #else 
static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif } static inline void nf_reset_trace(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } static inline void ipvs_reset(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_VS) skb->ipvs_property = 0; #endif } /* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { to->secmark = from->secmark; } static inline void skb_init_secmark(struct sk_buff *skb) { skb->secmark = 0; } #else static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { } static inline void skb_init_secmark(struct sk_buff *skb) { } #endif static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif } static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); } static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; } static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) { to->queue_mapping = from->queue_mapping; } static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) { skb->queue_mapping = rx_queue + 1; } static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return skb->queue_mapping != 0; } static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) { skb->dst_pending_confirm = val; } static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) { return skb->dst_pending_confirm != 0; } static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif } static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool 
skb_is_gso_v6(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_type = 0; } static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, u16 increment) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size += increment; } static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, u16 decrement) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size -= decrement; } void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ const struct skb_shared_info *shinfo = skb_shinfo(skb); if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } return false; } static inline void skb_forward_csum(struct sk_buff *skb) { /* Unfortunately we don't support this one. Any brave souls? */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /** * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE * @skb: skb to check * * fresh skbs have their ip_summed set to CHECKSUM_NONE. * Instead of forcing ip_summed to CHECKSUM_NONE, we can * use this helper, to document places where we make this assertion. */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check * * The head on skbs build around a head frag can be removed if they are * not cloned. This function returns true if the skb head is locked down * due to either being allocated via kmalloc, or by being a clone with * multiple references to the head. */ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. */ static inline __wsum lco_csum(struct sk_buff *skb) { unsigned char *csum_start = skb_checksum_start(skb); unsigned char *l4_hdr = skb_transport_header(skb); __wsum partial; /* Start with complement of inner checksum adjustment */ partial = ~csum_unfold(*(__force __sum16 *)(csum_start + skb->csum_offset)); /* Add in checksum of our headers (incl. outer checksum * adjustment filled in by caller) and return result. 
*/ return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } static inline bool skb_is_redirected(const struct sk_buff *skb) { return skb->redirected; } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb_clear_tstamp(skb); #endif } static inline void skb_reset_redirect(struct sk_buff *skb) { skb->redirected = 0; } static inline void skb_set_redirected_noclear(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; #endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; #else return 0; #endif } static inline void skb_reset_csum_not_inet(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; #if IS_ENABLED(CONFIG_IP_SCTP) skb->csum_not_inet = 0; #endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { #ifdef CONFIG_KCOV skb->kcov_handle = kcov_handle; #endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { #ifdef CONFIG_KCOV return skb->kcov_handle; #else return 0; #endif } static inline void skb_mark_for_recycle(struct sk_buff *skb) { #ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; #endif } ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */
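A minimal usage sketch (not part of skbuff.h) tying a few of the helpers above together. The fooproto_* names, the 4-byte header layout and the EtherType value are invented for illustration; only the skb helpers themselves (pskb_may_pull, skb_pull_rcsum, skb_cow_head, skb_push) are real, and the code assumes a normal kernel build with <linux/skbuff.h> available.

/* Hypothetical 4-byte protocol header used only for this sketch. */
struct fooproto_hdr {
	__be16 proto;
	__be16 flags;
};

/* Receive side: validate, read and strip the header while keeping
 * CHECKSUM_COMPLETE consistent (skb_pull_rcsum() does the pull and the
 * checksum adjustment in one step).
 */
static int fooproto_rcv_sketch(struct sk_buff *skb)
{
	const struct fooproto_hdr *hdr;

	if (!pskb_may_pull(skb, sizeof(*hdr)))
		return -EINVAL;

	hdr = (const struct fooproto_hdr *)skb->data;
	if (hdr->proto != htons(0x88B5))	/* made-up protocol value */
		return -EPROTO;

	skb_pull_rcsum(skb, sizeof(*hdr));
	return 0;
}

/* Transmit side: make the header area private and ensure enough headroom
 * (copy-on-write if the skb is cloned or short on headroom) before pushing
 * a new header in front of the data.
 */
static int fooproto_xmit_sketch(struct sk_buff *skb)
{
	struct fooproto_hdr *hdr;

	if (skb_cow_head(skb, sizeof(*hdr)))
		return -ENOMEM;

	hdr = skb_push(skb, sizeof(*hdr));
	hdr->proto = htons(0x88B5);
	hdr->flags = 0;
	return 0;
}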
// SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...)
\ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); 
list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free);
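/*
 * Illustrative sketch (not part of ref_tracker.c): one plausible way a
 * subsystem pairs ref_tracker_alloc()/ref_tracker_free() with its own
 * get/put operations so that a missing put is reported, with its allocation
 * stack, by ref_tracker_dir_exit(). The "foo" object and its helpers are
 * invented names, and ref_tracker_dir_init() is assumed to be the
 * <linux/ref_tracker.h> helper taking (dir, quarantine_count, name).
 */
#include <linux/ref_tracker.h>
#include <linux/slab.h>

struct foo {
	struct ref_tracker_dir	refs;
};

static void foo_init(struct foo *f)
{
	/* keep up to 16 released trackers quarantined to catch double-put */
	ref_tracker_dir_init(&f->refs, 16, "foo");
}

static int foo_hold(struct foo *f, struct ref_tracker **trk)
{
	/* saves the caller's stack in the stack depot and links the tracker */
	return ref_tracker_alloc(&f->refs, trk, GFP_KERNEL);
}

static void foo_put(struct foo *f, struct ref_tracker **trk)
{
	/* marks the tracker dead and moves it to the quarantine list */
	ref_tracker_free(&f->refs, trk);
}

static void foo_destroy(struct foo *f)
{
	/* any tracker still on dir->list is printed and triggers a WARN */
	ref_tracker_dir_exit(&f->refs);
}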
/*
 * llc_input.c - Minimal input path for LLC
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/llc_sap.h>

#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
#else
#define dprintk(args...)
#endif

/*
 * Packet handler for the station, registerable because in the minimal
 * LLC core that is taking shape only the very minimal subset of LLC that
 * is needed for things like IPX, Appletalk, etc will stay, with all the
 * rest in the llc1 and llc2 modules.
 */
static void (*llc_station_handler)(struct sk_buff *skb);

/*
 * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN.
 */
static void (*llc_type_handlers[2])(struct llc_sap *sap,
				    struct sk_buff *skb);

void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
					    struct sk_buff *skb))
{
	smp_wmb(); /* ensure initialisation is complete before it's called */
	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
		llc_type_handlers[type - 1] = handler;
}

void llc_remove_pack(int type)
{
	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
		llc_type_handlers[type - 1] = NULL;
	synchronize_net();
}

void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
{
	/* Ensure initialisation is complete before it's called */
	if (handler)
		smp_wmb();

	llc_station_handler = handler;

	if (!handler)
		synchronize_net();
}

/**
 *	llc_pdu_type - returns which LLC component must handle for PDU
 *	@skb: input skb
 *
 *	This function returns which LLC component must handle this PDU.
*/ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb_pull(skb, llc_len); skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? 
READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler);
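/*
 * Illustrative sketch (not part of llc_input.c): how a hypothetical module
 * could hook into the dispatch performed by llc_rcv() above, using only
 * llc_add_pack(), llc_remove_pack() and llc_set_station_handler() as defined
 * in this file. my_sap_handler()/my_station_handler() are invented names; in
 * practice the llc2 module registers its own, much richer handlers.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/llc.h>
#include <net/llc_sap.h>

static void my_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
{
	/* type-1 PDUs (UI/XID/TEST) addressed to a known SAP land here */
	kfree_skb(skb);
}

static void my_station_handler(struct sk_buff *skb)
{
	/* PDUs with a NULL DSAP are routed to the station component */
	kfree_skb(skb);
}

static int __init my_llc_user_init(void)
{
	llc_add_pack(LLC_DEST_SAP, my_sap_handler);
	llc_set_station_handler(my_station_handler);
	return 0;
}

static void __exit my_llc_user_exit(void)
{
	llc_set_station_handler(NULL);
	llc_remove_pack(LLC_DEST_SAP);
}

module_init(my_llc_user_init);
module_exit(my_llc_user_exit);
MODULE_LICENSE("GPL");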
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
* Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. 
*/ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? 
\ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. 
*/ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. 
*/ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { struct vlan_hdr vhdr, *vh; unsigned int header_len; if (!dev) return 0; /* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size. */ if (dev->min_header_len == dev->hard_header_len) header_len = dev->hard_header_len; else if (is_vlan_dev(dev)) header_len = dev->min_header_len; else return 0; vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; return ntohs(vh->h_vlan_TCI); } static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) proto = __vlan_get_protocol_offset(skb, proto, skb_mac_offset(skb), NULL); return proto; } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock_bh(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0); pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union 
tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer; } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. 
*/ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. 
* */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... 
*/ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. 
*/ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. 
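 * Only the tx_ring allocates pending_refcnt (see packet_alloc_pending());
 * it counts frames handed to the driver that have not yet been released by
 * tpacket_destruct_skb().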
*/ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned 
int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if 
(rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & 
PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __dev_remove_pack(&po->prot_hook); __fanout_link(sk, po); } err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. 
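	 * A true result lets the callers accept a frame that exceeds the
	 * device MTU by up to VLAN_HLEN bytes (see packet_sendmsg_spkt(),
	 * tpacket_snd() and packet_snd()).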
*/ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. 
*/ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. 
* * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? 
skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. 
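	 * If the skb carries no usable timestamp, fall back to the current
	 * time; any TP_STATUS_TS_ bits returned by tpacket_get_timestamp()
	 * are OR'ed into the slot status below.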
*/ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); h.h2->tp_vlan_tpid = ntohs(skb->protocol); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > 
__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? 
len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo = 0; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { if (need_wait && skb) { timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? 
-ETIMEDOUT : -ERESTARTSYS; goto out_put; } } /* check for additional frames */ continue; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || /* Note: packet_read_pending() might be slow if we have * to call it as it's per_cpu variable, but in fast-path * we already short-circuit the loop with the first * condition, and luckily don't have to go that path * anyway. */ (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. 
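	 * Otherwise cap the linear part and let sock_alloc_send_pskb() place
	 * the remainder (at most MAX_SKB_FRAGS costly-order pages) in page
	 * fragments.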
*/ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) 
skb->no_fcs = 1; packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? 
dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; po = pkt_sk(sk); err = packet_alloc_pending(po); if (err) goto out_sk_free; sock_init_data(sock, sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out_sk_free: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? 
*/ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = (sock->type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); if (dev) { aux.tp_vlan_tci = vlan_get_tci(skb, dev); aux.tp_vlan_tpid = ntohs(skb->protocol); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the 
buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { packet_dev_mc(dev, ml, -1); *mlp = ml->next; kfree(ml); } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... 
*/ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: if (optlen < sizeof(req_u.req)) break; ret = copy_from_sockptr(&req_u.req, optval, sizeof(req_u.req)) ? -EINVAL : 0; break; case TPACKET_V3: default: if (optlen < sizeof(req_u.req3)) break; ret = copy_from_sockptr(&req_u.req3, optval, sizeof(req_u.req3)) ? 
-EINVAL : 0; break; } if (!ret) ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? 
sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH: val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? 
((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. 
*/ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; if (was_running) { WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? 
tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_long_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %ld\n", atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); if (was_running) { WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) 
seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_IODEV_H__ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> #include <linux/errno.h> struct kvm_io_device; struct kvm_vcpu; /** * kvm_io_device_ops are called under kvm slots_lock. * read and write handlers return 0 if the transaction has been handled, * or non-zero to have it passed to the next device. **/ struct kvm_io_device_ops { int (*read)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *val); int (*write)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val); void (*destructor)(struct kvm_io_device *this); }; struct kvm_io_device { const struct kvm_io_device_ops *ops; }; static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { dev->ops = ops; } static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, void *v) { return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, const void *v) { return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } #endif /* __KVM_IODEV_H__ */
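The iodev.h header above defines the in-kernel MMIO/PIO device interface. Below is a minimal sketch of a device built on it; the device name, register layout and the "iodev.h" include path are illustrative assumptions, not code from the tree. Per the kvm_io_device_ops comment, a handler returns 0 once the access is handled and a non-zero value (conventionally -EOPNOTSUPP) to let the bus try the next device.

#include <linux/kvm_host.h>
#include "iodev.h"	/* the header above; include path assumed */

/* Hypothetical device: a single 32-bit scratch register. */
struct dummy_mmio_dev {
	struct kvm_io_device dev;
	u32 reg;
};

static int dummy_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			   gpa_t addr, int len, void *val)
{
	struct dummy_mmio_dev *d = container_of(this, struct dummy_mmio_dev, dev);

	if (len != sizeof(d->reg))
		return -EOPNOTSUPP;	/* not handled, try the next device */
	memcpy(val, &d->reg, len);
	return 0;			/* transaction handled */
}

static int dummy_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			    gpa_t addr, int len, const void *val)
{
	struct dummy_mmio_dev *d = container_of(this, struct dummy_mmio_dev, dev);

	if (len != sizeof(d->reg))
		return -EOPNOTSUPP;
	memcpy(&d->reg, val, len);
	return 0;
}

static const struct kvm_io_device_ops dummy_mmio_ops = {
	.read	= dummy_mmio_read,
	.write	= dummy_mmio_write,
};

/* A caller would wire it up with: kvm_iodevice_init(&d->dev, &dummy_mmio_ops); */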
// SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as constrained */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID.
*/ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. 
*/ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .init = safesetid_security_init, .name = "safesetid", };
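Seen from userspace, the task_fix_setuid hook above turns a disallowed transition into a failed call followed by SIGKILL. The sketch below is purely illustrative (the target UID 1234 and the existence of a policy constraining the caller's RUID are assumptions): if the caller's RUID is covered by a policy and 1234 is not an allowlisted destination, setuid() fails with EACCES and the task is then killed, so the perror() line is normally never reached.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumes a SafeSetID policy constrains this process's RUID and
	 * does not allow a transition to UID 1234. */
	if (setuid(1234) != 0)
		perror("setuid");	/* EACCES; SIGKILL follows from the LSM */
	return 0;
}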
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file is on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
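The eventpoll.h header above is kernel-internal, but ep_op_has_event() mirrors a visible userspace contract: EPOLL_CTL_ADD and EPOLL_CTL_MOD carry a struct epoll_event that the kernel copies in, while EPOLL_CTL_DEL does not. A small, hedged userspace sketch (monitoring stdin purely for illustration):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = 0 };
	int epfd = epoll_create1(0);

	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}
	/* ADD needs an event copied from userspace (ep_op_has_event() != 0)... */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, 0, &ev) < 0)
		perror("EPOLL_CTL_ADD");
	/* ...DEL does not, so the kernel never reads the last argument. */
	if (epoll_ctl(epfd, EPOLL_CTL_DEL, 0, NULL) < 0)
		perror("EPOLL_CTL_DEL");
	close(epfd);
	return 0;
}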
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accommodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } static inline bool timespec64_is_epoch(const struct timespec64 *ts) { return ts->tv_sec == 0 && ts->tv_nsec == 0; } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds than a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static
inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */
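To make the arithmetic in the helpers above concrete, here is a minimal kernel-context sketch (the function name and the values are illustrative): timespec64_add_ns() renormalizes tv_nsec after the addition, and timespec64_to_ns() collapses the result to a scalar nanosecond count.

/* 10.6 s + 1.5 s = 12.1 s, i.e. { .tv_sec = 12, .tv_nsec = 100000000 }. */
static s64 timespec64_example(void)
{
	struct timespec64 ts = { .tv_sec = 10, .tv_nsec = 600 * NSEC_PER_MSEC };

	timespec64_add_ns(&ts, 1500 * NSEC_PER_MSEC);

	/* 12 * NSEC_PER_SEC + 100 * NSEC_PER_MSEC = 12100000000 ns */
	return timespec64_to_ns(&ts);
}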
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/sched/signal.h> #include <trace/events/kvm.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/kvm_pkvm.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/virt.h> #include "trace.h" static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); static unsigned long __ro_after_init hyp_idmap_start; static unsigned long __ro_after_init hyp_idmap_end; static phys_addr_t __ro_after_init hyp_idmap_vector; u32 __ro_after_init __hyp_va_bits; static unsigned long __ro_after_init io_map_base; #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size) { phys_addr_t boundary = ALIGN_DOWN(addr + size, size); return (boundary - 1 < end - 1) ? boundary : end; } static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) { phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); return __stage2_range_addr_end(addr, end, size); } /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too * long will also starve other vCPUs. We have to also make sure that the page * tables are not freed while we released the lock. */ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end, int (*fn)(struct kvm_pgtable *, u64, u64), bool resched) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); int ret; u64 next; do { struct kvm_pgtable *pgt = mmu->pgt; if (!pgt) return -EINVAL; next = stage2_range_addr_end(addr, end); ret = fn(pgt, addr, next - addr); if (ret) break; if (resched && next != end) cond_resched_rwlock_write(&kvm->mmu_lock); } while (addr = next, addr != end); return ret; } #define stage2_apply_range_resched(mmu, addr, end, fn) \ stage2_apply_range(mmu, addr, end, fn, true) /* * Get the maximum number of page-tables pages needed to split a range * of blocks into PAGE_SIZE PTEs. It assumes the range is already * mapped at level 2, or at level 1 if allowed.
*/ static int kvm_mmu_split_nr_page_tables(u64 range) { int n = 0; if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) n += DIV_ROUND_UP(range, PUD_SIZE); n += DIV_ROUND_UP(range, PMD_SIZE); return n; } static bool need_split_memcache_topup_or_resched(struct kvm *kvm) { struct kvm_mmu_memory_cache *cache; u64 chunk_size, min; if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) return true; chunk_size = kvm->arch.mmu.split_page_chunk_size; min = kvm_mmu_split_nr_page_tables(chunk_size); cache = &kvm->arch.mmu.split_page_cache; return kvm_mmu_memory_cache_nr_free_objects(cache) < min; } static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { struct kvm_mmu_memory_cache *cache; struct kvm_pgtable *pgt; int ret, cache_capacity; u64 next, chunk_size; lockdep_assert_held_write(&kvm->mmu_lock); chunk_size = kvm->arch.mmu.split_page_chunk_size; cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); if (chunk_size == 0) return 0; cache = &kvm->arch.mmu.split_page_cache; do { if (need_split_memcache_topup_or_resched(kvm)) { write_unlock(&kvm->mmu_lock); cond_resched(); /* Eager page splitting is best-effort. */ ret = __kvm_mmu_topup_memory_cache(cache, cache_capacity, cache_capacity); write_lock(&kvm->mmu_lock); if (ret) break; } pgt = kvm->arch.mmu.pgt; if (!pgt) return -EINVAL; next = __stage2_range_addr_end(addr, end, chunk_size); ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); if (ret) break; } while (addr = next, addr != end); return ret; } static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); } /** * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 * @kvm: pointer to kvm structure. * * Interface to HYP function to flush all VM TLB entries */ int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (is_protected_kvm_enabled()) kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); else kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); return 0; } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { u64 size = nr_pages << PAGE_SHIFT; u64 addr = gfn << PAGE_SHIFT; if (is_protected_kvm_enabled()) kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); else kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); return 0; } static bool kvm_is_device_pfn(unsigned long pfn) { return !pfn_is_map_memory(pfn); } static void *stage2_memcache_zalloc_page(void *arg) { struct kvm_mmu_memory_cache *mc = arg; void *virt; /* Allocated with __GFP_ZERO, so no need to zero */ virt = kvm_mmu_memory_cache_alloc(mc); if (virt) kvm_account_pgtable_pages(virt, 1); return virt; } static void *kvm_host_zalloc_pages_exact(size_t size) { return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } static void *kvm_s2_zalloc_pages_exact(size_t size) { void *virt = kvm_host_zalloc_pages_exact(size); if (virt) kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); return virt; } static void kvm_s2_free_pages_exact(void *virt, size_t size) { kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); free_pages_exact(virt, size); } static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); s8 level = page_private(page); KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } static void stage2_free_unlinked_table(void *addr, s8 level) { struct page 
*page = virt_to_page(addr); set_page_private(page, (unsigned long)level); call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); } static void kvm_host_get_page(void *addr) { get_page(virt_to_page(addr)); } static void kvm_host_put_page(void *addr) { put_page(virt_to_page(addr)); } static void kvm_s2_put_page(void *addr) { struct page *p = virt_to_page(addr); /* Dropping last refcount, the page will be freed */ if (page_count(p) == 1) kvm_account_pgtable_pages(addr, -1); put_page(p); } static int kvm_host_page_count(void *addr) { return page_count(virt_to_page(addr)); } static phys_addr_t kvm_host_pa(void *addr) { return __pa(addr); } static void *kvm_host_va(phys_addr_t phys) { return __va(phys); } static void clean_dcache_guest_page(void *va, size_t size) { __clean_dcache_guest_page(va, size); } static void invalidate_icache_guest_page(void *va, size_t size) { __invalidate_icache_guest_page(va, size); } /* * Unmapping vs dcache management: * * If a guest maps certain memory pages as uncached, all writes will * bypass the data cache and go directly to RAM. However, the CPUs * can still speculate reads (not writes) and fill cache lines with * data. * * Those cache lines will be *clean* cache lines though, so a * clean+invalidate operation is equivalent to an invalidate * operation, because no cache lines are marked dirty. * * Those clean cache lines could be filled prior to an uncached write * by the guest, and the cache coherent IO subsystem would therefore * end up writing old data to disk. * * This is why right after unmapping a page/section and invalidating * the corresponding TLBs, we flush to make sure the IO subsystem will * never hit in the cache. * * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as * we then fully enforce cacheability of RAM, no matter what the guest * does. */ /** * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap * @may_block: Whether or not we are permitted to block * * Clear a range of stage-2 mappings, lowering the various ref-counts. Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before * destroying the VM), otherwise another faulting VCPU may come in and mess * with things behind our backs. */ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); phys_addr_t end = start + size; lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block) { __unmap_stage2_range(mmu, start, size, may_block); } void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); } /** * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 * @kvm: The struct kvm pointer * * Go through the stage 2 page tables and invalidate any cache lines * backing memory already mapped to the VM. 
*/ static void stage2_flush_vm(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int idx, bkt; idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, bkt, slots) stage2_flush_memslot(kvm, memslot); kvm_nested_s2_flush(kvm); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } /** * free_hyp_pgds - free Hyp-mode page tables */ void __init free_hyp_pgds(void) { mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgtable) { kvm_pgtable_hyp_destroy(hyp_pgtable); kfree(hyp_pgtable); hyp_pgtable = NULL; } mutex_unlock(&kvm_hyp_pgd_mutex); } static bool kvm_host_owns_hyp_mappings(void) { if (is_kernel_in_hyp_mode()) return false; if (static_branch_likely(&kvm_protected_mode_initialized)) return false; /* * This can happen at boot time when __create_hyp_mappings() is called * after the hyp protection has been enabled, but the static key has * not been flipped yet. */ if (!hyp_pgtable && is_protected_kvm_enabled()) return false; WARN_ON(!hyp_pgtable); return true; } int __create_hyp_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { int err; if (WARN_ON(!kvm_host_owns_hyp_mappings())) return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); return err; } static phys_addr_t kvm_kaddr_to_phys(void *kaddr) { if (!is_vmalloc_addr(kaddr)) { BUG_ON(!virt_addr_valid(kaddr)); return __pa(kaddr); } else { return page_to_phys(vmalloc_to_page(kaddr)) + offset_in_page(kaddr); } } struct hyp_shared_pfn { u64 pfn; int count; struct rb_node node; }; static DEFINE_MUTEX(hyp_shared_pfns_lock); static struct rb_root hyp_shared_pfns = RB_ROOT; static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, struct rb_node **parent) { struct hyp_shared_pfn *this; *node = &hyp_shared_pfns.rb_node; *parent = NULL; while (**node) { this = container_of(**node, struct hyp_shared_pfn, node); *parent = **node; if (this->pfn < pfn) *node = &((**node)->rb_left); else if (this->pfn > pfn) *node = &((**node)->rb_right); else return this; } return NULL; } static int share_pfn_hyp(u64 pfn) { struct rb_node **node, *parent; struct hyp_shared_pfn *this; int ret = 0; mutex_lock(&hyp_shared_pfns_lock); this = find_shared_pfn(pfn, &node, &parent); if (this) { this->count++; goto unlock; } this = kzalloc(sizeof(*this), GFP_KERNEL); if (!this) { ret = -ENOMEM; goto unlock; } this->pfn = pfn; this->count = 1; rb_link_node(&this->node, parent, node); rb_insert_color(&this->node, &hyp_shared_pfns); ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); unlock: mutex_unlock(&hyp_shared_pfns_lock); return ret; } static int unshare_pfn_hyp(u64 pfn) { struct rb_node **node, *parent; struct hyp_shared_pfn *this; int ret = 0; mutex_lock(&hyp_shared_pfns_lock); this = find_shared_pfn(pfn, &node, &parent); if (WARN_ON(!this)) { ret = -ENOENT; goto unlock; } this->count--; if (this->count) goto unlock; rb_erase(&this->node, &hyp_shared_pfns); kfree(this); ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); unlock: mutex_unlock(&hyp_shared_pfns_lock); return ret; } int kvm_share_hyp(void *from, void *to) { phys_addr_t start, end, cur; u64 pfn; int ret; if (is_kernel_in_hyp_mode()) return 0; /* * The share hcall maps things in the 'fixed-offset' region of the hyp * VA space, so we can only share physically contiguous data-structures * for now. 
*/ if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) return -EINVAL; if (kvm_host_owns_hyp_mappings()) return create_hyp_mappings(from, to, PAGE_HYP); start = ALIGN_DOWN(__pa(from), PAGE_SIZE); end = PAGE_ALIGN(__pa(to)); for (cur = start; cur < end; cur += PAGE_SIZE) { pfn = __phys_to_pfn(cur); ret = share_pfn_hyp(pfn); if (ret) return ret; } return 0; } void kvm_unshare_hyp(void *from, void *to) { phys_addr_t start, end, cur; u64 pfn; if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) return; start = ALIGN_DOWN(__pa(from), PAGE_SIZE); end = PAGE_ALIGN(__pa(to)); for (cur = start; cur < end; cur += PAGE_SIZE) { pfn = __phys_to_pfn(cur); WARN_ON(unshare_pfn_hyp(pfn)); } } /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range * @to: The virtual kernel end address of the range (exclusive) * @prot: The protection to be applied to this range * * The same virtual address as the kernel virtual address is also used * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) { phys_addr_t phys_addr; unsigned long virt_addr; unsigned long start = kern_hyp_va((unsigned long)from); unsigned long end = kern_hyp_va((unsigned long)to); if (is_kernel_in_hyp_mode()) return 0; if (!kvm_host_owns_hyp_mappings()) return -EPERM; start = start & PAGE_MASK; end = PAGE_ALIGN(end); for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, prot); if (err) return err; } return 0; } static int __hyp_alloc_private_va_range(unsigned long base) { lockdep_assert_held(&kvm_hyp_pgd_mutex); if (!PAGE_ALIGNED(base)) return -EINVAL; /* * Verify that BIT(VA_BITS - 1) hasn't been flipped by * allocating the new area, as it would indicate we've * overflowed the idmap/IO address range. */ if ((base ^ io_map_base) & BIT(VA_BITS - 1)) return -ENOMEM; io_map_base = base; return 0; } /** * hyp_alloc_private_va_range - Allocates a private VA range. * @size: The size of the VA range to reserve. * @haddr: The hypervisor virtual start address of the allocation. * * The private virtual address (VA) range is allocated below io_map_base * and aligned based on the order of @size. * * Return: 0 on success or negative error code on failure. */ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) { unsigned long base; int ret = 0; mutex_lock(&kvm_hyp_pgd_mutex); /* * This assumes that we have enough space below the idmap * page to allocate our VAs. If not, the check in * __hyp_alloc_private_va_range() will kick. A potential * alternative would be to detect that overflow and switch * to an allocation above the idmap. * * The allocated size is always a multiple of PAGE_SIZE. 
*/ size = PAGE_ALIGN(size); base = io_map_base - size; ret = __hyp_alloc_private_va_range(base); mutex_unlock(&kvm_hyp_pgd_mutex); if (!ret) *haddr = base; return ret; } static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, unsigned long *haddr, enum kvm_pgtable_prot prot) { unsigned long addr; int ret = 0; if (!kvm_host_owns_hyp_mappings()) { addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, phys_addr, size, prot); if (IS_ERR_VALUE(addr)) return addr; *haddr = addr; return 0; } size = PAGE_ALIGN(size + offset_in_page(phys_addr)); ret = hyp_alloc_private_va_range(size, &addr); if (ret) return ret; ret = __create_hyp_mappings(addr, size, phys_addr, prot); if (ret) return ret; *haddr = addr + offset_in_page(phys_addr); return ret; } int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) { unsigned long base; size_t size; int ret; mutex_lock(&kvm_hyp_pgd_mutex); /* * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ size = NVHE_STACK_SIZE * 2; base = ALIGN_DOWN(io_map_base - size, size); ret = __hyp_alloc_private_va_range(base); mutex_unlock(&kvm_hyp_pgd_mutex); if (ret) { kvm_err("Cannot allocate hyp stack guard page\n"); return ret; } /* * Since the stack grows downwards, map the stack to the page * at the higher address and leave the lower guard page * unbacked. * * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, phys_addr, PAGE_HYP); if (ret) kvm_err("Cannot map hyp stack\n"); *haddr = base + size; return ret; } /** * create_hyp_io_mappings - Map IO into both kernel and HYP * @phys_addr: The physical start address which gets mapped * @size: Size of the region being mapped * @kaddr: Kernel VA for this mapping * @haddr: HYP VA for this mapping */ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr) { unsigned long addr; int ret; if (is_protected_kvm_enabled()) return -EPERM; *kaddr = ioremap(phys_addr, size); if (!*kaddr) return -ENOMEM; if (is_kernel_in_hyp_mode()) { *haddr = *kaddr; return 0; } ret = __create_hyp_private_mapping(phys_addr, size, &addr, PAGE_HYP_DEVICE); if (ret) { iounmap(*kaddr); *kaddr = NULL; *haddr = NULL; return ret; } *haddr = (void __iomem *)addr; return 0; } /** * create_hyp_exec_mappings - Map an executable range into HYP * @phys_addr: The physical start address which gets mapped * @size: Size of the region being mapped * @haddr: HYP VA for this mapping */ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr) { unsigned long addr; int ret; BUG_ON(is_kernel_in_hyp_mode()); ret = __create_hyp_private_mapping(phys_addr, size, &addr, PAGE_HYP_EXEC); if (ret) { *haddr = NULL; return ret; } *haddr = (void *)addr; return 0; } static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { /* We shouldn't need any other callback to walk the PT */ .phys_to_virt = kvm_host_va, }; static int get_user_mapping_size(struct kvm *kvm, u64 addr) { struct kvm_pgtable pgt = { .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_LAST_LEVEL - ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... 
*/ s8 level = S8_MAX; int ret; /* * Disable IRQs so that we hazard against a concurrent * teardown of the userspace page tables (which relies on * IPI-ing threads). */ local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); local_irq_restore(flags); if (ret) return ret; /* * Not seeing an error, but not updating level? Something went * deeply wrong... */ if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) return -EFAULT; if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) return -EFAULT; /* Oops, the userspace PTs are gone... Replay the fault */ if (!kvm_pte_valid(pte)) return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, .free_unlinked_table = stage2_free_unlinked_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, .phys_to_virt = kvm_host_va, .virt_to_phys = kvm_host_pa, .dcache_clean_inval_poc = clean_dcache_guest_page, .icache_inval_pou = invalidate_icache_guest_page, }; static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); u64 mmfr0, mmfr1; u32 phys_shift; if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) return -EINVAL; phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (is_protected_kvm_enabled()) { phys_shift = kvm_ipa_limit; } else if (phys_shift) { if (phys_shift > kvm_ipa_limit || phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; if (phys_shift > kvm_ipa_limit) { pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", current->comm); return -EINVAL; } } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); return 0; } /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called in two cases: * * - when the VM is created, which can't race against anything * * - when secondary kvm_s2_mmu structures are initialised for NV * guests, and the caller must hold kvm->lock as this is called on a * per-vcpu basis. */ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { int cpu, err; struct kvm_pgtable *pgt; /* * If we already have our page tables in place, and that the * MMU context is the canonical one, we have a bug somewhere, * as this is only supposed to ever happen once per VM. * * Otherwise, we're building nested page tables, and that's * probably because userspace called KVM_ARM_VCPU_INIT more * than once on the same vcpu. Since that's actually legal, * don't kick a fuss and leave gracefully. 
*/ if (mmu->pgt != NULL) { if (kvm_is_nested_s2_mmu(kvm, mmu)) return 0; kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } err = kvm_init_ipa_range(mmu, type); if (err) return err; pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; mmu->arch = &kvm->arch; err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; mmu->pgt = pgt; if (is_protected_kvm_enabled()) return 0; mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; goto out_destroy_pgtable; } for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; /* The eager page splitting is disabled by default */ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; mmu->split_page_cache.gfp_zero = __GFP_ZERO; mmu->pgd_phys = __pa(pgt->pgd); if (kvm_is_nested_s2_mmu(kvm, mmu)) kvm_init_nested_s2_mmu(mmu); return 0; out_destroy_pgtable: KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; } void kvm_uninit_stage2_mmu(struct kvm *kvm) { kvm_free_stage2_pgd(&kvm->arch.mmu); kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { hva_t hva = memslot->userspace_addr; phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t size = PAGE_SIZE * memslot->npages; hva_t reg_end = hva + size; /* * A memory region could potentially cover multiple VMAs, and any holes * between them, so iterate over all of them to find out if we should * unmap any of them. * * +--------------------------------------------+ * +---------------+----------------+ +----------------+ * | : VMA 1 | VMA 2 | | VMA 3 : | * +---------------+----------------+ +----------------+ * | memory region | * +--------------------------------------------+ */ do { struct vm_area_struct *vma; hva_t vm_start, vm_end; vma = find_vma_intersection(current->mm, hva, reg_end); if (!vma) break; /* * Take the intersection of this VMA with the memory region */ vm_start = max(hva, vma->vm_start); vm_end = min(reg_end, vma->vm_end); if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); } /** * stage2_unmap_vm - Unmap Stage-2 RAM mappings * @kvm: The struct kvm pointer * * Go through the memregions and unmap any regular RAM * backing memory already mapped to the VM. 
*/ void stage2_unmap_vm(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int idx, bkt; idx = srcu_read_lock(&kvm->srcu); mmap_read_lock(current->mm); write_lock(&kvm->mmu_lock); slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); } void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable *pgt = NULL; write_lock(&kvm->mmu_lock); pgt = mmu->pgt; if (pgt) { mmu->pgd_phys = 0; mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } write_unlock(&kvm->mmu_lock); if (pgt) { KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } static void hyp_mc_free_fn(void *addr, void *unused) { free_page((unsigned long)addr); } static void *hyp_mc_alloc_fn(void *unused) { return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); } void free_hyp_memcache(struct kvm_hyp_memcache *mc) { if (!is_protected_kvm_enabled()) return; kfree(mc->mapping); __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) { if (!is_protected_kvm_enabled()) return 0; if (!mc->mapping) { mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); if (!mc->mapping) return -ENOMEM; } return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, kvm_host_pa, NULL); } /** * kvm_phys_addr_ioremap - map a device range to guest IPA * * @kvm: The KVM pointer * @guest_ipa: The IPA at which to insert the mapping * @pa: The physical address of the device * @size: The size of the mapping * @writable: Whether or not to create a writable mapping */ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable) { phys_addr_t addr; int ret = 0; struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; struct kvm_s2_mmu *mmu = &kvm->arch.mmu; struct kvm_pgtable *pgt = mmu->pgt; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_R | (writable ? KVM_PGTABLE_PROT_W : 0); if (is_protected_kvm_enabled()) return -EPERM; size += offset_in_page(guest_ipa); guest_ipa &= PAGE_MASK; for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(mmu)); if (ret) break; write_lock(&kvm->mmu_lock); ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; pa += PAGE_SIZE; } kvm_mmu_free_memory_cache(&cache); return ret; } /** * kvm_stage2_wp_range() - write protect stage2 memory region range * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot * @kvm: The KVM pointer * @slot: The memory slot to write protect * * Called to start logging dirty pages after memory region * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns * all present PUD, PMD and PTEs are write protected in the memory region. * Afterwards read of dirty page log can be called. * * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, * serializing operations for VM memory regions. 
*/ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); phys_addr_t start, end; if (WARN_ON_ONCE(!memslot)) return; start = memslot->base_gfn << PAGE_SHIFT; end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); kvm_stage2_wp_range(&kvm->arch.mmu, start, end); kvm_nested_s2_wp(kvm); write_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs_memslot(kvm, memslot); } /** * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE * pages for memory slot * @kvm: The KVM pointer * @slot: The memory slot to split * * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, * serializing operations for VM memory regions. */ static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; phys_addr_t start, end; lockdep_assert_held(&kvm->slots_lock); slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, slot); start = memslot->base_gfn << PAGE_SHIFT; end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); kvm_mmu_split_huge_pages(kvm, start, end); write_unlock(&kvm->mmu_lock); } /* * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. * @kvm: The KVM pointer * @slot: The memory slot associated with mask * @gfn_offset: The gfn offset in memory slot * @mask: The mask of pages at offset 'gfn_offset' in this memory * slot to enable dirty logging on * * Writes protect selected pages to enable dirty logging, and then * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { phys_addr_t base_gfn = slot->base_gfn + gfn_offset; phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; lockdep_assert_held_write(&kvm->mmu_lock); kvm_stage2_wp_range(&kvm->arch.mmu, start, end); /* * Eager-splitting is done when manual-protect is set. We * also check for initially-all-set because we can avoid * eager-splitting if initially-all-set is false. * Initially-all-set equal false implies that huge-pages were * already split when enabling dirty logging: no need to do it * again. */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_split_huge_pages(kvm, start, end); kvm_nested_s2_wp(kvm); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva, unsigned long map_size) { gpa_t gpa_start; hva_t uaddr_start, uaddr_end; size_t size; /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ if (map_size == PAGE_SIZE) return true; size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; uaddr_start = memslot->userspace_addr; uaddr_end = uaddr_start + size; /* * Pages belonging to memslots that don't have the same alignment * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 * PMD/PUD entries, because we'll end up mapping the wrong pages. 
* * Consider a layout like the following: * * memslot->userspace_addr: * +-----+--------------------+--------------------+---+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| * +-----+--------------------+--------------------+---+ * * memslot->base_gfn << PAGE_SHIFT: * +---+--------------------+--------------------+-----+ * |abc|def Stage-2 block | Stage-2 block |tvxyz| * +---+--------------------+--------------------+-----+ * * If we create those stage-2 blocks, we'll end up with this incorrect * mapping: * d -> f * e -> g * f -> h */ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) return false; /* * Next, let's make sure we're not trying to map anything not covered * by the memslot. This means we have to prohibit block size mappings * for the beginning and end of a non-block aligned and non-block sized * memory slot (illustrated by the head and tail parts of the * userspace view above containing pages 'abcde' and 'xyz', * respectively). * * Note that it doesn't matter if we do the check using the * userspace_addr or the base_gfn, as both are equally aligned (per * the check above) and equally sized. */ return (hva & ~(map_size - 1)) >= uaddr_start && (hva & ~(map_size - 1)) + map_size <= uaddr_end; } /* * Check if the given hva is backed by a transparent huge page (THP) and * whether it can be mapped using block mapping in stage2. If so, adjust * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently * supported. This will need to be updated to support other THP sizes. * * Returns the size of the mapping. */ static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; /* * Make sure the adjustment is done only for THP pages. Also make * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { int sz = get_user_mapping_size(kvm, hva); if (sz < 0) return sz; if (sz < PMD_SIZE) return PAGE_SIZE; *ipap &= PMD_MASK; pfn &= ~(PTRS_PER_PMD - 1); *pfnp = pfn; return PMD_SIZE; } /* Use page mapping if we cannot use block mapping. */ return PAGE_SIZE; } static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) { unsigned long pa; if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) return huge_page_shift(hstate_vma(vma)); if (!(vma->vm_flags & VM_PFNMAP)) return PAGE_SHIFT; VM_BUG_ON(is_vm_hugetlb_page(vma)); pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); #ifndef __PAGETABLE_PMD_FOLDED if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && ALIGN(hva, PUD_SIZE) <= vma->vm_end) return PUD_SHIFT; #endif if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && ALIGN(hva, PMD_SIZE) <= vma->vm_end) return PMD_SHIFT; return PAGE_SHIFT; } /* * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be * able to see the page's tags and therefore they must be initialised first. If * PG_mte_tagged is set, tags have already been initialised. 
* * The race in the test/set of the PG_mte_tagged flag is handled by: * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs * racing to santise the same page * - mmap_lock protects between a VM faulting a page in and the VMM performing * an mprotect() to add VM_MTE */ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; if (folio_test_hugetlb(folio)) { /* Hugetlb has MTE flags set on head page only */ if (folio_try_hugetlb_mte_tagging(folio)) { for (i = 0; i < nr_pages; i++, page++) mte_clear_page_tags(page_address(page)); folio_set_hugetlb_mte_tagged(folio); } return; } for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } } } static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) { return vma->vm_flags & VM_MTE_ALLOWED; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) { int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; struct vm_area_struct *vma; short vma_shift; void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); if (fault_is_perm && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } /* * Permission faults just need to update the existing leaf entry, * and so normally don't require allocations from the memcache. The * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ if (!fault_is_perm || (logging_active && write_fault)) { int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); if (!is_protected_kvm_enabled()) { memcache = &vcpu->arch.mmu_page_cache; ret = kvm_mmu_topup_memory_cache(memcache, min_pages); } else { memcache = &vcpu->arch.pkvm_memcache; ret = topup_hyp_memcache(memcache, min_pages); } if (ret) return ret; } /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ mmap_read_lock(current->mm); vma = vma_lookup(current->mm, hva); if (unlikely(!vma)) { kvm_err("Failed to find VMA for hva 0x%lx\n", hva); mmap_read_unlock(current->mm); return -EFAULT; } /* * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. 
*/ if (logging_active || is_protected_kvm_enabled()) { force_pte = true; vma_shift = PAGE_SHIFT; } else { vma_shift = get_vma_page_shift(vma, hva); } switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) break; fallthrough; #endif case CONT_PMD_SHIFT: vma_shift = PMD_SHIFT; fallthrough; case PMD_SHIFT: if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) break; fallthrough; case CONT_PTE_SHIFT: vma_shift = PAGE_SHIFT; force_pte = true; fallthrough; case PAGE_SHIFT: break; default: WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); } vma_pagesize = 1UL << vma_shift; if (nested) { unsigned long max_map_size; max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; ipa = kvm_s2_trans_output(nested); /* * If we're about to create a shadow stage 2 entry, then we * can only create a block mapping if the guest stage 2 page * table uses at least as big a mapping. */ max_map_size = min(kvm_s2_trans_size(nested), max_map_size); /* * Be careful that if the mapping size falls between * two host sizes, take the smallest of the two. */ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) max_map_size = PMD_SIZE; else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) max_map_size = PAGE_SIZE; force_pte = (max_map_size == PAGE_SIZE); vma_pagesize = min(vma_pagesize, (long)max_map_size); } /* * Both the canonical IPA and fault IPA must be hugepage-aligned to * ensure we find the right PFN and lay down the mapping in the right * place. */ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); ipa &= ~(vma_pagesize - 1); } gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; /* * Read mmu_invalidate_seq so that KVM can detect if the results of * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * with the smp_wmb() in kvm_mmu_invalidate_end(). */ mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; } if (is_error_noslot_pfn(pfn)) return -EFAULT; if (kvm_is_device_pfn(pfn)) { /* * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() * change things at the last minute. */ device = true; } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write * fault. */ writable = false; } if (exec_fault && device) return -ENOEXEC; /* * Potentially reduce shadow S2 permissions to match the guest's own * S2. For exec faults, we'd only reach this point if the guest * actually allowed it (see kvm_s2_handle_perm_fault). * * Also encode the level of the original translation in the SW bits * of the leaf entry as a proxy for the span of that translation. * This will be retrieved on TLB invalidation from the guest and * used to limit the invalidation scope if a TTL hint or a range * isn't provided. 
*/ if (nested) { writable &= kvm_s2_trans_writable(nested); if (!kvm_s2_trans_readable(nested)) prot &= ~KVM_PGTABLE_PROT_R; prot |= kvm_encode_nested_level(nested); } kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; goto out_unlock; } /* * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); if (vma_pagesize < 0) { ret = vma_pagesize; goto out_unlock; } } if (!fault_is_perm && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; goto out_unlock; } } if (writable) prot |= KVM_PGTABLE_PROT_W; if (exec_fault) prot |= KVM_PGTABLE_PROT_X; if (device) { if (vfio_allow_any_uc) prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && (!nested || kvm_s2_trans_executable(nested))) { prot |= KVM_PGTABLE_PROT_X; } /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ if (fault_is_perm && vma_pagesize == fault_granule) { /* * Drop the SW bits in favour of those stored in the * PTE, which will be preserved. */ prot &= ~KVM_NV_GUEST_MAP_SZ; ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); } else { ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, memcache, flags); } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, writable); kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); } /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer * * Any abort that gets to the host is almost guaranteed to be caused by a * missing second stage translation table entry, which can mean that either the * guest simply needs more memory and we must allocate an appropriate page or it * can mean that the guest tried to access I/O memory, which is emulated by user * space. The distinction is based on the IPA causing the fault and whether this * memory region has been registered as standard RAM by user space. 
*/ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { struct kvm_s2_trans nested_trans, *nested = NULL; unsigned long esr; phys_addr_t fault_ipa; /* The address we faulted on */ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; esr = kvm_vcpu_get_esr(vcpu); ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); return 1; } /* Falls between the IPA range and the PARange? */ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) kvm_inject_pabt(vcpu, fault_ipa); else kvm_inject_dabt(vcpu, fault_ipa); return 1; } } /* Synchronous External Abort? */ if (kvm_vcpu_abt_issea(vcpu)) { /* * For RAS the host kernel may handle this abort. * There is no need to pass the error into the guest. */ if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu))) kvm_inject_vabt(vcpu); return 1; } trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ if (!esr_fsc_is_translation_fault(esr) && !esr_fsc_is_permission_fault(esr) && !esr_fsc_is_access_flag_fault(esr)) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), (unsigned long)kvm_vcpu_get_esr(vcpu)); return -EFAULT; } idx = srcu_read_lock(&vcpu->kvm->srcu); /* * We may have faulted on a shadow stage 2 page table if we are * running a nested guest. In this case, we have to resolve the L2 * IPA to the L1 IPA first, before knowing what kind of memory should * back the L1 IPA. * * If the shadow stage 2 page table walk faults, then we simply inject * this to the guest and carry on. * * If there are no shadow S2 PTs because S2 is disabled, there is * nothing to walk and we treat it as a 1:1 before going through the * canonical translation. */ if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && vcpu->arch.hw_mmu->nested_stage2_enabled) { u32 esr; ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); if (ret) { esr = kvm_s2_trans_esr(&nested_trans); kvm_inject_s2_fault(vcpu, esr); goto out_unlock; } ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); if (ret) { esr = kvm_s2_trans_esr(&nested_trans); kvm_inject_s2_fault(vcpu, esr); goto out_unlock; } ipa = kvm_s2_trans_output(&nested_trans); nested = &nested_trans; } gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); if (kvm_is_error_hva(hva) || (write_fault && !writable)) { /* * The guest has put either its instructions or its page-tables * somewhere it shouldn't have. Userspace won't be able to do * anything about this (there's no syndrome for a start), so * re-inject the abort back into the guest. */ if (is_iabt) { ret = -ENOEXEC; goto out; } if (kvm_vcpu_abt_iss1tw(vcpu)) { kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); ret = 1; goto out_unlock; } /* * Check for a cache maintenance operation. Since we * ended-up here, we know it is outside of any memory * slot. But we can't find out if that is for a device, * or if the guest is just being stupid. 
The only thing * we know for sure is that this range cannot be cached. * * So let's assume that the guest is just being * cautious, and skip the instruction. */ if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { kvm_incr_pc(vcpu); ret = 1; goto out_unlock; } /* * The IPA is reported as [MAX:12], so we need to * complement it with the bottom 12 bits from the * faulting VA. This is always 12 bits, irrespective * of the page size. */ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; } ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: if (ret == -ENOEXEC) { kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); ret = 1; } out_unlock: srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return false; __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); kvm_nested_s2_unmap(kvm, range->may_block); return false; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; if (!kvm->arch.mmu.pgt) return false; return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); /* * TODO: Handle nested_mmu structures here using the reverse mapping in * a later version of patch series. */ } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; if (!kvm->arch.mmu.pgt) return false; return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); } phys_addr_t kvm_mmu_get_httbr(void) { return __pa(hyp_pgtable->pgd); } phys_addr_t kvm_get_idmap_vector(void) { return hyp_idmap_vector; } static int kvm_map_idmap_text(void) { unsigned long size = hyp_idmap_end - hyp_idmap_start; int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, PAGE_HYP_EXEC); if (err) kvm_err("Failed to idmap %lx-%lx\n", hyp_idmap_start, hyp_idmap_end); return err; } static void *kvm_hyp_zalloc_page(void *arg) { return (void *)get_zeroed_page(GFP_KERNEL); } static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .zalloc_page = kvm_hyp_zalloc_page, .get_page = kvm_host_get_page, .put_page = kvm_host_put_page, .phys_to_virt = kvm_host_va, .virt_to_phys = kvm_host_pa, }; int __init kvm_mmu_init(u32 *hyp_va_bits) { int err; u32 idmap_bits; u32 kernel_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); /* * We rely on the linker script to ensure at build time that the HYP * init code does not cross a page boundary. */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); /* * The ID map is always configured for 48 bits of translation, which * may be fewer than the number of VA bits used by the regular kernel * stage 1, when VA_BITS=52. * * At EL2, there is only one TTBR register, and we can't switch between * translation tables *and* update TCR_EL2.T0SZ at the same time. 
Bottom * line: we need to use the extended range with *both* our translation * tables. * * So use the maximum of the idmap VA bits and the regular kernel stage * 1 VA bits to assure that the hypervisor can both ID map its code page * and map any kernel memory. */ idmap_bits = IDMAP_VA_BITS; kernel_bits = vabits_actual; *hyp_va_bits = max(idmap_bits, kernel_bits); kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), kern_hyp_va((unsigned long)high_memory - 1)); if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { /* * The idmap page is intersecting with the VA space, * it is not safe to continue further. */ kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); err = -EINVAL; goto out; } hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); if (!hyp_pgtable) { kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; goto out; } err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops); if (err) goto out_free_pgtable; err = kvm_map_idmap_text(); if (err) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: kvm_pgtable_hyp_destroy(hyp_pgtable); out_free_pgtable: kfree(hyp_pgtable); hyp_pgtable = NULL; out: return err; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; /* * At this point memslot has been committed and there is an * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ if (log_dirty_pages) { if (change == KVM_MR_DELETE) return; /* * Huge and normal pages are write-protected and split * on either of these two cases: * * 1. with initial-all-set: gradually with CLEAR ioctls, */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) return; /* * or * 2. without initial-all-set: all in one shot when * enabling dirty logging. */ kvm_mmu_wp_memory_region(kvm, new->id); kvm_mmu_split_memory_region(kvm, new->id); } else { /* * Free any leftovers from the eager page splitting cache. Do * this when deleting, moving, disabling dirty logging, or * creating the memslot (a nop). Doing it for deletes makes * sure we don't leak memory, and there's no need to keep the * cache around for any of the other cases. */ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } } int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { hva_t hva, reg_end; int ret = 0; if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && change != KVM_MR_FLAGS_ONLY) return 0; /* * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) return -EFAULT; hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); mmap_read_lock(current->mm); /* * A memory region could potentially cover multiple VMAs, and any holes * between them, so iterate over all of them. 
* * +--------------------------------------------+ * +---------------+----------------+ +----------------+ * | : VMA 1 | VMA 2 | | VMA 3 : | * +---------------+----------------+ +----------------+ * | memory region | * +--------------------------------------------+ */ do { struct vm_area_struct *vma; vma = find_vma_intersection(current->mm, hva, reg_end); if (!vma) break; if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { ret = -EINVAL; break; } if (vma->vm_flags & VM_PFNMAP) { /* IO region dirty page logging not allowed */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { ret = -EINVAL; break; } } hva = min(reg_end, vma->vm_end); } while (hva < reg_end); mmap_read_unlock(current->mm); return ret; } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { } void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { gpa_t gpa = slot->base_gfn << PAGE_SHIFT; phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } /* * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). * * Main problems: * - S/W ops are local to a CPU (not broadcast) * - We have line migration behind our back (speculation) * - System caches don't support S/W at all (damn!) * * In the face of the above, the best we can do is to try and convert * S/W ops to VA ops. Because the guest is not allowed to infer the * S/W to PA mapping, it can only use S/W to nuke the whole cache, * which is a rather good thing for us. * * Also, it is only used when turning caches on/off ("The expected * usage of the cache maintenance instructions that operate by set/way * is associated with the cache maintenance instructions associated * with the powerdown and powerup of caches, if this is required by * the implementation."). * * We use the following policy: * * - If we trap a S/W operation, we enable VM trapping to detect * caches being turned on/off, and do a full clean. * * - We flush the caches on both caches being turned on and off. * * - Once the caches are enabled, we stop trapping VM ops. */ void kvm_set_way_flush(struct kvm_vcpu *vcpu) { unsigned long hcr = *vcpu_hcr(vcpu); /* * If this is the first time we do a S/W operation * (i.e. HCR_TVM not set) flush the whole memory, and set the * VM trapping. * * Otherwise, rely on the VM trapping to wait for the MMU + * Caches to be turned off. At that point, we'll be able to * clean the caches again. */ if (!(hcr & HCR_TVM)) { trace_kvm_set_way_flush(*vcpu_pc(vcpu), vcpu_has_cache_enabled(vcpu)); stage2_flush_vm(vcpu->kvm); *vcpu_hcr(vcpu) = hcr | HCR_TVM; } } void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) { bool now_enabled = vcpu_has_cache_enabled(vcpu); /* * If switching the MMU+caches on, need to invalidate the caches. * If switching it off, need to clean the caches. * Clean + invalidate does the trick always. */ if (now_enabled != was_enabled) stage2_flush_vm(vcpu->kvm); /* Caches are now on, stop trapping VM ops (until a S/W op) */ if (now_enabled) *vcpu_hcr(vcpu) &= ~HCR_TVM; trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); }
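Both stage2_apply_range() and kvm_mmu_split_huge_pages() above cut a large IPA range into block-aligned chunks via __stage2_range_addr_end() so that kvm->mmu_lock can be dropped between chunks. The following is a minimal, standalone userspace sketch of that chunking arithmetic only; align_down(), the sample addresses and the 2 MiB chunk size are illustrative stand-ins for this example and are not part of the kernel source.

/*
 * Minimal userspace sketch (not kernel code) of the range-chunking
 * arithmetic used by __stage2_range_addr_end()/stage2_apply_range():
 * walk [addr, end) one block-aligned chunk at a time so a caller can
 * drop its lock and reschedule between chunks. align_down() is a
 * local stand-in for the kernel's ALIGN_DOWN() macro.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;

static phys_addr_t align_down(phys_addr_t x, phys_addr_t size)
{
	return x & ~(size - 1);		/* size is assumed to be a power of two */
}

static phys_addr_t range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size)
{
	phys_addr_t boundary = align_down(addr + size, size);

	/* Comparing the decremented values keeps this correct if 'end' wraps to 0. */
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	const phys_addr_t chunk = 0x200000;	/* e.g. a 2MiB block */
	phys_addr_t addr = 0x100000, end = 0x700000, next;

	/* Walk [0x100000, 0x700000) in at most 2MiB, block-aligned steps. */
	do {
		next = range_addr_end(addr, end, chunk);
		printf("apply to [%#llx, %#llx) (%llu bytes)\n",
		       (unsigned long long)addr, (unsigned long long)next,
		       (unsigned long long)(next - addr));
		/* the kernel would cond_resched()/drop the mmu_lock here */
	} while (addr = next, addr != end);

	return 0;
}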
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. 
*/ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lease_manager_operations { bool (*lm_break)(struct file_lease *); int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The various i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock_core { struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t flc_owner; unsigned int flc_flags; unsigned char flc_type; pid_t flc_pid; int flc_link_cpu; /* what cpu's list is this on? 
*/ wait_queue_head_t flc_wait; struct file *flc_file; }; struct file_lock { struct file_lock_core c; loff_t fl_start; loff_t fl_end; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lease { struct file_lock_core c; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); static inline bool lock_is_unlock(struct file_lock *fl) { return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up(struct file_lock *fl) { wake_up(&fl->c.flc_wait); } static inline bool locks_can_async_lock(const struct file_operations *fops) { return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; } /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct 
notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return smp_load_acquire(&inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline bool lock_is_unlock(struct file_lock *fl) { return false; } static inline bool lock_is_read(struct file_lock *fl) { return false; } static inline bool lock_is_write(struct file_lock *fl) { return false; } static inline void locks_wake_up(struct file_lock *fl) { } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_init_lease(struct file_lease *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } static inline int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool 
locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ /* for walking lists of file_locks linked by fl_list */ #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { int ret; ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; ret = break_deleg(*delegated_inode, O_WRONLY); iput(*delegated_inode); *delegated_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); return 0; } #else /* !CONFIG_FILE_LOCKING */ static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { return 0; } static inline int break_deleg_wait(struct inode **delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */
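/*
 * Hedged usage sketch: one way a filesystem with no locking machinery of its
 * own might route its ->lock() file operation through the helpers declared
 * above. "myfs_lock" and "myfs_file_operations" are hypothetical names used
 * purely for illustration; they are not part of this header.
 */
#include <linux/filelock.h>
#include <linux/fs.h>

static int myfs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
	if (IS_GETLK(cmd)) {
		/* F_GETLK only reports a conflicting lock; nothing is taken. */
		posix_test_lock(filp, fl);
		return 0;
	}

	/*
	 * Apply the request (or drop it, if lock_is_unlock(fl)) against the
	 * per-inode POSIX lock table. FL_SLEEP in fl->c.flc_flags marks a
	 * blocking request (F_SETLKW); the VFS caller performs the waiting.
	 */
	return posix_lock_file(filp, fl, NULL);
}

static const struct file_operations myfs_file_operations = {
	.lock	= myfs_lock,
	/* other operations elided */
};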
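/*
 * Hedged sketch of the retry pattern that try_break_deleg() and
 * break_deleg_wait() above are meant for, loosely modelled on how the VFS
 * wraps delegation recall around metadata-changing operations.
 * "myfs_change_metadata" is a hypothetical function used only to show the
 * shape of the loop.
 */
#include <linux/filelock.h>
#include <linux/fs.h>

static int myfs_change_metadata(struct inode *inode)
{
	struct inode *delegated_inode = NULL;
	int error;

retry:
	inode_lock(inode);
	/* Start recalling an outstanding delegation without sleeping here. */
	error = try_break_deleg(inode, &delegated_inode);
	if (!error) {
		/* ... modify the inode here ... */
	}
	inode_unlock(inode);

	if (delegated_inode) {
		/* Safe to wait now that the inode lock has been dropped. */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	return error;
}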
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. 
*/ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = call_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we must not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ #define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (flags & FPB_IGNORE_DIRTY) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @addr: The user virtual address the first page is mapped at. * @start_ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. * @any_young: Optional pointer to indicate whether any entry except the * first one is young. * @any_dirty: Optional pointer to indicate whether any entry except the * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. * * Return: the number of table entries in the batch. 
*/ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young, dirty; int nr; if (any_writable) *any_writable = false; if (any_young) *any_young = false; if (any_dirty) *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); if (any_young) young = !!pte_young(pte); if (any_dirty) dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; /* * Stop immediately once we reached the end of the folio. In * corner cases the next PFN might fall into a different * folio. */ if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; if (any_young) *any_young |= young; if (any_dirty) *any_dirty |= dirty; nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); ptep += nr; } return min(ptep - start_ptep, max_nr); } /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); if (pte_swp_exclusive(pte)) new = pte_swp_mkexclusive(new); if (pte_swp_uffd_wp(pte)) new = pte_swp_mkuffd_wp(new); return new; } /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_next_swp_offset(pte_t pte) { return pte_move_swp_offset(pte, 1); } /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. * @max_nr: The maximum number of table entries to consider. * @pte: Page table entry for the first entry. * * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs * containing swap entries all with consecutive offsets and targeting the same * swap type, all with matching swp pte bits. * * max_nr must be at least one and must be limited by the caller so scanning * cannot exceed a single page table. * * Return: the number of table entries in the batch. 
*/ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. 
One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is considered the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. 
* PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy we want; * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8, its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy may not be PageBuddy, may be outside @page's zone, or may * have a different order than @page. The validation is necessary before using it. * * Return: the found buddy page or NULL if not found. 
*/ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } extern void prep_compound_page(struct page *page, unsigned int order); void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) 
\ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. 
*/ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: * Caller knows at least 1 page of folio is associated with page table of VMA * and the range [start, end) is intersect with the VMA range. Caller wants * to know whether the folio is fully associated with the range. It calls * this function to check whether the folio is in the range first. Then checks * the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. 
* 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. 
*/ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) { } #endif extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); 
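/*
 * Standalone illustration (plain C, not kernel code) of the arithmetic used
 * by vma_address() above, simplified to the single-page (nr_pages == 1) case:
 * file page @pgoff maps at vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT),
 * provided that address still lies inside the VMA. All constants are made up.
 */
#include <stdio.h>

#define TOY_PAGE_SHIFT 12UL	/* 4 KiB pages */

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL;	/* VMA start */
	unsigned long vm_end   = 0x7f0000100000UL;	/* VMA end (1 MiB later) */
	unsigned long vm_pgoff = 0x10;			/* VMA maps the file from page 16 */
	unsigned long pgoff    = 0x25;			/* page 37 of the file */
	unsigned long addr;

	if (pgoff < vm_pgoff) {
		puts("page is below the mapped range: vma_address() would return -EFAULT");
		return 0;
	}

	addr = vm_start + ((pgoff - vm_pgoff) << TOY_PAGE_SHIFT);

	if (addr >= vm_end)
		puts("page is above the mapped range: vma_address() would return -EFAULT");
	else
		/* Prints 0x7f0000015000 for the values above. */
		printf("file page 0x%lx is mapped at 0x%lx\n", pgoff, addr);
	return 0;
}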
/* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; enum migrate_reason reason; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); void free_zone_device_folio(struct folio *folio); int 
migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); /* * mm/gup.c */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags); /* * mm/huge_memory.c */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. */ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. 
*/ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. */ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd) { return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd); } static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte) { return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks, bool for_stack); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); void free_pte(struct mm_struct *mm, unsigned long addr, struct 
mmu_gather *tlb, pmd_t pmdval); void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, struct mmu_gather *tlb); #ifdef CONFIG_PT_RECLAIM bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details); #else static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details) { return false; } #endif /* CONFIG_PT_RECLAIM */ #endif /* __MM_INTERNAL_H */
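The get_order_from_str() helper above strings together memparse()-style suffix parsing, a power-of-two check and get_order(), then rejects any order outside a caller-supplied bitmask of valid orders. The sketch below replays the same arithmetic in self-contained userspace C; it assumes 4 KiB base pages, and memparse_demo(), is_power_of_2_demo() and get_order_demo() are simplified local stand-ins for the kernel helpers, not their real implementations.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT_DEMO 12			/* assume 4 KiB base pages */

/* Simplified memparse(): a number with an optional K/M/G suffix. */
static unsigned long long memparse_demo(const char *s, char **endp)
{
	unsigned long long v = strtoull(s, endp, 0);

	switch (**endp) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; (*endp)++; break;
	}
	return v;
}

static bool is_power_of_2_demo(unsigned long long v)
{
	return v && !(v & (v - 1));
}

/* Smallest order such that (1 << order) base pages cover 'size' bytes. */
static int get_order_demo(unsigned long long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT_DEMO;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

static int get_order_from_str_demo(const char *size_str, unsigned long valid_orders)
{
	char *end;
	unsigned long long size = memparse_demo(size_str, &end);
	int order;

	if (!is_power_of_2_demo(size))
		return -1;			/* kernel code returns -EINVAL here */
	order = get_order_demo(size);
	if ((1UL << order) & ~valid_orders)
		return -1;			/* order not in the permitted set */
	return order;
}

int main(void)
{
	/* Allow only order 0 (4 KiB) and order 9 (2 MiB). */
	unsigned long valid = (1UL << 0) | (1UL << 9);

	printf("2M  -> %d\n", get_order_from_str_demo("2M", valid));	/* 9  */
	printf("64K -> %d\n", get_order_from_str_demo("64K", valid));	/* -1 */
	return 0;
}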
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #ifndef __IPVLAN_H #define __IPVLAN_H #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" #define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) #define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) #define IPVLAN_MAC_FILTER_BITS 8 #define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) #define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) #define IPVLAN_QBACKLOG_LIMIT 1000 typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, IPVL_IPV4, IPVL_ARP, } ipvl_hdr_type; struct ipvl_pcpu_stats { u64_stats_t rx_pkts; u64_stats_t rx_bytes; u64_stats_t rx_mcast; u64_stats_t tx_pkts; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errs; u32 tx_drps; }; struct ipvl_port; struct ipvl_dev { struct net_device *dev; struct list_head pnode; struct ipvl_port *port; struct net_device *phy_dev; struct list_head addrs; struct ipvl_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; }; struct ipvl_addr { struct ipvl_dev *master; /* Back pointer to master */ union { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ struct list_head anode; /* logical-interface linkage */ ipvl_hdr_type atype; struct rcu_head rcu; }; struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; u16 mode; u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; int count; struct ida ida; netdevice_tracker dev_tracker; }; struct ipvl_skb_cb { bool tx_pkt; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) { return rcu_dereference(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d) { return rcu_dereference_bh(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) { return
rtnl_dereference(d->rx_handler_data); } static inline bool ipvlan_is_private(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_PRIVATE); } static inline void ipvlan_mark_private(struct ipvl_port *port) { port->flags |= IPVLAN_F_PRIVATE; } static inline void ipvlan_clear_private(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_PRIVATE; } static inline bool ipvlan_is_vepa(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_VEPA); } static inline void ipvlan_mark_vepa(struct ipvl_port *port) { port->flags |= IPVLAN_F_VEPA; } static inline void ipvlan_clear_vepa(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_VEPA; } void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); void ipvlan_process_multicast(struct work_struct *work); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest); void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type); void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast); int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); int ipvlan_link_register(struct rtnl_link_ops *ops); #ifdef CONFIG_IPVLAN_L3S int ipvlan_l3s_register(struct ipvl_port *port); void ipvlan_l3s_unregister(struct ipvl_port *port); void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet); int ipvlan_l3s_init(void); void ipvlan_l3s_cleanup(void); #else static inline int ipvlan_l3s_register(struct ipvl_port *port) { return -ENOTSUPP; } static inline void ipvlan_l3s_unregister(struct ipvl_port *port) { } static inline void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { } static inline int ipvlan_l3s_init(void) { return 0; } static inline void ipvlan_l3s_cleanup(void) { } #endif /* CONFIG_IPVLAN_L3S */ static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame; } #endif /* __IPVLAN_H */
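ipvlan.h above sizes its address hash table as IPVLAN_HASH_SIZE = 1 << BITS_PER_BYTE (256 buckets) and derives power-of-two masks for both the hash table and the MAC filter bitmap; because the sizes are powers of two, a bucket index is simply "hash & mask". The standalone sketch below shows only that masking step; toy_hash() is a placeholder mixer for illustration, not the hashing actually used in ipvlan_core.c.

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE		8
#define IPVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)	/* 256 buckets */
#define IPVLAN_HASH_MASK	(IPVLAN_HASH_SIZE - 1)	/* 0xff */

/* Placeholder mixer; the real driver uses its own hash in ipvlan_core.c. */
static uint32_t toy_hash(const uint8_t *key, int len)
{
	uint32_t h = 0x9e3779b9u;
	int i;

	for (i = 0; i < len; i++)
		h = (h ^ key[i]) * 0x01000193u;
	return h;
}

int main(void)
{
	uint8_t ip4[4] = { 192, 0, 2, 17 };
	uint32_t h = toy_hash(ip4, (int)sizeof(ip4));

	/* Power-of-two table size: masking is equivalent to "h % IPVLAN_HASH_SIZE". */
	printf("hash 0x%08x -> bucket %u of %d\n",
	       h, h & IPVLAN_HASH_MASK, IPVLAN_HASH_SIZE);
	return 0;
}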
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/adjust_pc.h> #include <linux/compiler.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) static u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: return read_gicreg(ICH_LR0_EL2); case 1: return read_gicreg(ICH_LR1_EL2); case 2: return read_gicreg(ICH_LR2_EL2); case 3: return read_gicreg(ICH_LR3_EL2); case 4: return read_gicreg(ICH_LR4_EL2); case 5: return read_gicreg(ICH_LR5_EL2); case 6: return read_gicreg(ICH_LR6_EL2); case 7: return read_gicreg(ICH_LR7_EL2); case 8: return read_gicreg(ICH_LR8_EL2); case 9: return read_gicreg(ICH_LR9_EL2); case 10: return read_gicreg(ICH_LR10_EL2); case 11: return read_gicreg(ICH_LR11_EL2); case 12: return read_gicreg(ICH_LR12_EL2); case 13: return read_gicreg(ICH_LR13_EL2); case 14: return read_gicreg(ICH_LR14_EL2); case 15: return read_gicreg(ICH_LR15_EL2); } unreachable(); } static void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: write_gicreg(val, ICH_LR0_EL2); break; case 1: write_gicreg(val, ICH_LR1_EL2); break; case 2: write_gicreg(val, ICH_LR2_EL2); break; case 3: write_gicreg(val, ICH_LR3_EL2); break; case 4: write_gicreg(val, ICH_LR4_EL2); break; case 5: write_gicreg(val, ICH_LR5_EL2); break; case 6: write_gicreg(val, ICH_LR6_EL2); break; case 7: write_gicreg(val, ICH_LR7_EL2); break; case 8: write_gicreg(val, ICH_LR8_EL2); break; case 9: write_gicreg(val, ICH_LR9_EL2); break; case 10: write_gicreg(val, ICH_LR10_EL2); break; case 11: write_gicreg(val, ICH_LR11_EL2); break; case 12: write_gicreg(val,
ICH_LR12_EL2); break; case 13: write_gicreg(val, ICH_LR13_EL2); break; case 14: write_gicreg(val, ICH_LR14_EL2); break; case 15: write_gicreg(val, ICH_LR15_EL2); break; } } static void __vgic_v3_write_ap0rn(u32 val, int n) { switch (n) { case 0: write_gicreg(val, ICH_AP0R0_EL2); break; case 1: write_gicreg(val, ICH_AP0R1_EL2); break; case 2: write_gicreg(val, ICH_AP0R2_EL2); break; case 3: write_gicreg(val, ICH_AP0R3_EL2); break; } } static void __vgic_v3_write_ap1rn(u32 val, int n) { switch (n) { case 0: write_gicreg(val, ICH_AP1R0_EL2); break; case 1: write_gicreg(val, ICH_AP1R1_EL2); break; case 2: write_gicreg(val, ICH_AP1R2_EL2); break; case 3: write_gicreg(val, ICH_AP1R3_EL2); break; } } static u32 __vgic_v3_read_ap0rn(int n) { u32 val; switch (n) { case 0: val = read_gicreg(ICH_AP0R0_EL2); break; case 1: val = read_gicreg(ICH_AP0R1_EL2); break; case 2: val = read_gicreg(ICH_AP0R2_EL2); break; case 3: val = read_gicreg(ICH_AP0R3_EL2); break; default: unreachable(); } return val; } static u32 __vgic_v3_read_ap1rn(int n) { u32 val; switch (n) { case 0: val = read_gicreg(ICH_AP1R0_EL2); break; case 1: val = read_gicreg(ICH_AP1R1_EL2); break; case 2: val = read_gicreg(ICH_AP1R2_EL2); break; case 3: val = read_gicreg(ICH_AP1R3_EL2); break; default: unreachable(); } return val; } void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; /* * Make sure stores to the GIC via the memory mapped interface * are now visible to the system register interface when reading the * LRs, and when reading back the VMCR on non-VHE systems. */ if (used_lrs || !has_vhe()) { if (!cpu_if->vgic_sre) { dsb(sy); isb(); } } if (used_lrs || cpu_if->its_vpe.its_vm) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); __gic_v3_set_lr(0, i); } } } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; int i; if (used_lrs || cpu_if->its_vpe.its_vm) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); } /* * Ensure that writes to the LRs, and on non-VHE systems ensure that * the write to the VMCR in __vgic_v3_activate_traps(), will have * reached the (re)distributors. This ensure the guest will read the * correct values from the memory-mapped interface. */ if (used_lrs || !has_vhe()) { if (!cpu_if->vgic_sre) { isb(); dsb(sy); } } } void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) { /* * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a * Group0 interrupt (as generated in GICv2 mode) to be * delivered as a FIQ to the guest, with potentially fatal * consequences. So we must make sure that ICC_SRE_EL1 has * been actually programmed with the value we want before * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). * * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is * provisioned at all. In order to prevent illegal accesses to the * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 * so that the trap bits can take effect. Yes, we *loves* the GIC. 
*/ if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); isb(); } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); if (has_vhe()) { /* * Ensure that the write to the VMCR will have reached * the (re)distributors. This ensure the guest will * read the correct values from the memory-mapped * interface. */ isb(); dsb(sy); } } /* * Prevent the guest from touching the ICC_SRE_EL1 system * register. Note that this may not have any effect, as * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being * injected. Note that this also applies if we don't expect * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; if (!cpu_if->vgic_sre) { cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); } val = read_gicreg(ICC_SRE_EL2); write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); if (!cpu_if->vgic_sre) { /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ isb(); write_gicreg(1, ICC_SRE_EL1); } /* * If we were trapping system registers, we enabled the VGIC even if * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); switch (nr_pre_bits) { case 7: cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); fallthrough; case 6: cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); fallthrough; default: cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); } switch (nr_pre_bits) { case 7: cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); fallthrough; case 6: cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); fallthrough; default: cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); } } static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); switch (nr_pre_bits) { case 7: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); fallthrough; case 6: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); fallthrough; default: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); } switch (nr_pre_bits) { case 7: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); fallthrough; case 6: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); fallthrough; default: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); } } void __vgic_v3_init_lrs(void) { int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); int i; for (i = 0; i <= max_lr_idx; i++) __gic_v3_set_lr(0, i); } /* * Return the GIC CPU configuration: * - [31:0] ICH_VTR_EL2 * - [62:32] RES0 * - [63] MMIO (GICv2) capable */ u64 __vgic_v3_get_gic_config(void) { u64 val, sre = read_gicreg(ICC_SRE_EL1); unsigned long flags = 0; /* * To check whether we have a MMIO-based (GICv2 compatible) * CPU interface, we need to disable the system 
register * view. To do that safely, we have to prevent any interrupt * from firing (which would be deadly). * * Note that this only makes sense on VHE, as interrupts are * already masked for nVHE as part of the exception entry to * EL2. */ if (has_vhe()) flags = local_daif_save(); /* * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. */ sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); isb(); write_gicreg(0, ICC_SRE_EL1); isb(); val = read_gicreg(ICC_SRE_EL1); write_gicreg(sre, ICC_SRE_EL1); isb(); sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); isb(); if (has_vhe()) local_daif_restore(flags); val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); return val; } static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_save_aprs(cpu_if); if (cpu_if->vgic_sre) cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); } void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { /* * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen * is dependent on ICC_SRE_EL1.SRE, and we have to perform the * VMCR_EL2 save/restore in the world switch. */ if (cpu_if->vgic_sre) __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); __vgic_v3_restore_aprs(cpu_if); } static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); } static int __vgic_v3_get_group(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; return crm != 8; } #define GICv3_IDLE_PRIORITY 0xff static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, u64 *lr_val) { unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; u8 priority = GICv3_IDLE_PRIORITY; int i, lr = -1; for (i = 0; i < used_lrs; i++) { u64 val = __gic_v3_get_lr(i); u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* Not pending in the state? */ if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) continue; /* Group-0 interrupt, but Group-0 disabled? */ if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) continue; /* Group-1 interrupt, but Group-1 disabled? */ if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) continue; /* Not the highest priority? 
*/ if (lr_prio >= priority) continue; /* This is a candidate */ priority = lr_prio; *lr_val = val; lr = i; } if (lr == -1) *lr_val = ICC_IAR1_EL1_SPURIOUS; return lr; } static int __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid, u64 *lr_val) { unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; int i; for (i = 0; i < used_lrs; i++) { u64 val = __gic_v3_get_lr(i); if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && (val & ICH_LR_ACTIVE_BIT)) { *lr_val = val; return i; } } *lr_val = ICC_IAR1_EL1_SPURIOUS; return -1; } static int __vgic_v3_get_highest_active_priority(void) { u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); u32 hap = 0; int i; for (i = 0; i < nr_apr_regs; i++) { u32 val; /* * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers * contain the active priority levels for this VCPU * for the maximum number of supported priority * levels, and we return the full priority level only * if the BPR is programmed to its minimum, otherwise * we return a combination of the priority level and * subpriority, as determined by the setting of the * BPR, but without the full subpriority. */ val = __vgic_v3_read_ap0rn(i); val |= __vgic_v3_read_ap1rn(i); if (!val) { hap += 32; continue; } return (hap + __ffs(val)) << __vgic_v3_bpr_min(); } return GICv3_IDLE_PRIORITY; } static unsigned int __vgic_v3_get_bpr0(u32 vmcr) { return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; } static unsigned int __vgic_v3_get_bpr1(u32 vmcr) { unsigned int bpr; if (vmcr & ICH_VMCR_CBPR_MASK) { bpr = __vgic_v3_get_bpr0(vmcr); if (bpr < 7) bpr++; } else { bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; } return bpr; } /* * Convert a priority to a preemption level, taking the relevant BPR * into account by zeroing the sub-priority bits. */ static u8 __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) { unsigned int bpr; if (!grp) bpr = __vgic_v3_get_bpr0(vmcr) + 1; else bpr = __vgic_v3_get_bpr1(vmcr); return pri & (GENMASK(7, 0) << bpr); } /* * The priority value is independent of any of the BPR values, so we * normalize it using the minimal BPR value. This guarantees that no * matter what the guest does with its BPR, we can always set/get the * same value of a priority. */ static void __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) { u8 pre, ap; u32 val; int apr; pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); ap = pre >> __vgic_v3_bpr_min(); apr = ap / 32; if (!grp) { val = __vgic_v3_read_ap0rn(apr); __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); } else { val = __vgic_v3_read_ap1rn(apr); __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); } } static int __vgic_v3_clear_highest_active_priority(void) { u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); u32 hap = 0; int i; for (i = 0; i < nr_apr_regs; i++) { u32 ap0, ap1; int c0, c1; ap0 = __vgic_v3_read_ap0rn(i); ap1 = __vgic_v3_read_ap1rn(i); if (!ap0 && !ap1) { hap += 32; continue; } c0 = ap0 ? __ffs(ap0) : 32; c1 = ap1 ? 
__ffs(ap1) : 32; /* Always clear the LSB, which is the highest priority */ if (c0 < c1) { ap0 &= ~BIT(c0); __vgic_v3_write_ap0rn(ap0, i); hap += c0; } else { ap1 &= ~BIT(c1); __vgic_v3_write_ap1rn(ap1, i); hap += c1; } /* Rescale to 8 bits of priority */ return hap << __vgic_v3_bpr_min(); } return GICv3_IDLE_PRIORITY; } static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 lr_val; u8 lr_prio, pmr; int lr, grp; grp = __vgic_v3_get_group(vcpu); lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); if (lr < 0) goto spurious; if (grp != !!(lr_val & ICH_LR_GROUP)) goto spurious; pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; if (pmr <= lr_prio) goto spurious; if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) goto spurious; lr_val &= ~ICH_LR_STATE; lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); return; spurious: vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); } static void __vgic_v3_clear_active_lr(int lr, u64 lr_val) { lr_val &= ~ICH_LR_ACTIVE_BIT; if (lr_val & ICH_LR_HW) { u32 pid; pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; gic_write_dir(pid); } __gic_v3_set_lr(lr_val, lr); } static void __vgic_v3_bump_eoicount(void) { u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); hcr += 1 << ICH_HCR_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; int lr; /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) return; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) return; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { __vgic_v3_bump_eoicount(); return; } __vgic_v3_clear_active_lr(lr, lr_val); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; u8 lr_prio, act_prio; int lr, grp; grp = __vgic_v3_get_group(vcpu); /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { /* Do not bump EOIcount for LPIs that aren't in the LRs */ if (!(vid >= VGIC_MIN_LPI)) __vgic_v3_bump_eoicount(); return; } /* EOImode == 1 and not an LPI, nothing to be done here */ if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) return; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. 
*/ if (grp != !!(lr_val & ICH_LR_GROUP) || __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) return; /* Let's now perform the deactivation */ __vgic_v3_clear_active_lr(lr, lr_val); } static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); } static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); } static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); if (val & 1) vmcr |= ICH_VMCR_ENG0_MASK; else vmcr &= ~ICH_VMCR_ENG0_MASK; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); if (val & 1) vmcr |= ICH_VMCR_ENG1_MASK; else vmcr &= ~ICH_VMCR_ENG1_MASK; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); } static void __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); } static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min() - 1; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; val <<= ICH_VMCR_BPR0_SHIFT; val &= ICH_VMCR_BPR0_MASK; vmcr &= ~ICH_VMCR_BPR0_MASK; vmcr |= val; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min(); if (vmcr & ICH_VMCR_CBPR_MASK) return; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; val <<= ICH_VMCR_BPR1_SHIFT; val &= ICH_VMCR_BPR1_MASK; vmcr &= ~ICH_VMCR_BPR1_MASK; vmcr |= val; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) { u32 val; if (!__vgic_v3_get_group(vcpu)) val = __vgic_v3_read_ap0rn(n); else val = __vgic_v3_read_ap1rn(n); vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) { u32 val = vcpu_get_reg(vcpu, rt); if (!__vgic_v3_get_group(vcpu)) __vgic_v3_write_ap0rn(val, n); else __vgic_v3_write_ap1rn(val, n); } static void __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 0); } static void __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 1); } static void __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 2); } static void __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 3); } static void __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 0); } static void __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 1); } static void __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 2); } static void __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 3); } static void __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 lr_val; int lr, lr_grp, grp; grp = __vgic_v3_get_group(vcpu); lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); if (lr == -1) goto spurious; lr_grp = !!(lr_val & ICH_LR_GROUP); if (lr_grp != grp) lr_val = ICC_IAR1_EL1_SPURIOUS; spurious: vcpu_set_reg(vcpu, rt, lr_val & 
ICH_LR_VIRTUAL_ID_MASK); } static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vmcr &= ICH_VMCR_PMR_MASK; vmcr >>= ICH_VMCR_PMR_SHIFT; vcpu_set_reg(vcpu, rt, vmcr); } static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); val <<= ICH_VMCR_PMR_SHIFT; val &= ICH_VMCR_PMR_MASK; vmcr &= ~ICH_VMCR_PMR_MASK; vmcr |= val; write_gicreg(vmcr, ICH_VMCR_EL2); } static void __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = __vgic_v3_get_highest_active_priority(); vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vtr, val; vtr = read_gicreg(ICH_VTR_EL2); /* PRIbits */ val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; /* CBPR */ val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); if (val & ICC_CTLR_EL1_CBPR_MASK) vmcr |= ICH_VMCR_CBPR_MASK; else vmcr &= ~ICH_VMCR_CBPR_MASK; if (val & ICC_CTLR_EL1_EOImode_MASK) vmcr |= ICH_VMCR_EOIM_MASK; else vmcr &= ~ICH_VMCR_EOIM_MASK; write_gicreg(vmcr, ICH_VMCR_EL2); } static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read) { u64 ich_hcr; if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) return false; ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); switch (sysreg) { case SYS_ICC_IGRPEN0_EL1: if (is_read && (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; case SYS_ICC_AP0Rn_EL1(0): case SYS_ICC_AP0Rn_EL1(1): case SYS_ICC_AP0Rn_EL1(2): case SYS_ICC_AP0Rn_EL1(3): case SYS_ICC_BPR0_EL1: case SYS_ICC_EOIR0_EL1: case SYS_ICC_HPPIR0_EL1: case SYS_ICC_IAR0_EL1: return ich_hcr & ICH_HCR_TALL0; case SYS_ICC_IGRPEN1_EL1: if (is_read && (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; case SYS_ICC_AP1Rn_EL1(0): case SYS_ICC_AP1Rn_EL1(1): case SYS_ICC_AP1Rn_EL1(2): case SYS_ICC_AP1Rn_EL1(3): case SYS_ICC_BPR1_EL1: case SYS_ICC_EOIR1_EL1: case SYS_ICC_HPPIR1_EL1: case SYS_ICC_IAR1_EL1: return ich_hcr & ICH_HCR_TALL1; case SYS_ICC_DIR_EL1: if (ich_hcr & ICH_HCR_TDIR) return true; fallthrough; case SYS_ICC_RPR_EL1: case SYS_ICC_CTLR_EL1: case SYS_ICC_PMR_EL1: return ich_hcr & ICH_HCR_TC; default: return false; } } int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; u64 esr; u32 vmcr; void (*fn)(struct kvm_vcpu *, u32, int); bool is_read; u32 sysreg; if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return 0; esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { __kvm_skip_instr(vcpu); return 1; } sysreg = esr_cp15_to_sysreg(esr); } else { sysreg = esr_sys64_to_sysreg(esr); } is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) return 0; switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_iar; break; case 
SYS_ICC_EOIR0_EL1: case SYS_ICC_EOIR1_EL1: if (unlikely(is_read)) return 0; fn = __vgic_v3_write_eoir; break; case SYS_ICC_IGRPEN1_EL1: if (is_read) fn = __vgic_v3_read_igrpen1; else fn = __vgic_v3_write_igrpen1; break; case SYS_ICC_BPR1_EL1: if (is_read) fn = __vgic_v3_read_bpr1; else fn = __vgic_v3_write_bpr1; break; case SYS_ICC_AP0Rn_EL1(0): case SYS_ICC_AP1Rn_EL1(0): if (is_read) fn = __vgic_v3_read_apxr0; else fn = __vgic_v3_write_apxr0; break; case SYS_ICC_AP0Rn_EL1(1): case SYS_ICC_AP1Rn_EL1(1): if (is_read) fn = __vgic_v3_read_apxr1; else fn = __vgic_v3_write_apxr1; break; case SYS_ICC_AP0Rn_EL1(2): case SYS_ICC_AP1Rn_EL1(2): if (is_read) fn = __vgic_v3_read_apxr2; else fn = __vgic_v3_write_apxr2; break; case SYS_ICC_AP0Rn_EL1(3): case SYS_ICC_AP1Rn_EL1(3): if (is_read) fn = __vgic_v3_read_apxr3; else fn = __vgic_v3_write_apxr3; break; case SYS_ICC_HPPIR0_EL1: case SYS_ICC_HPPIR1_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_hppir; break; case SYS_ICC_IGRPEN0_EL1: if (is_read) fn = __vgic_v3_read_igrpen0; else fn = __vgic_v3_write_igrpen0; break; case SYS_ICC_BPR0_EL1: if (is_read) fn = __vgic_v3_read_bpr0; else fn = __vgic_v3_write_bpr0; break; case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_rpr; break; case SYS_ICC_CTLR_EL1: if (is_read) fn = __vgic_v3_read_ctlr; else fn = __vgic_v3_write_ctlr; break; case SYS_ICC_PMR_EL1: if (is_read) fn = __vgic_v3_read_pmr; else fn = __vgic_v3_write_pmr; break; default: return 0; } vmcr = __vgic_v3_read_vmcr(); rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); __kvm_skip_instr(vcpu); return 1; }
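Much of the register juggling above reduces to a handful of bit-field computations: vtr_to_max_lr_idx()/vtr_to_nr_pre_bits()/vtr_to_nr_apr_regs() decode ICH_VTR_EL2, __vgic_v3_bpr_min() is 8 minus the number of preemption bits, and __vgic_v3_pri_to_pre() masks off the sub-priority bits below the binary point. The standalone sketch below replays that arithmetic on a made-up VTR value; it takes the effective BPR directly, whereas the kernel helper first derives it from VMCR and the interrupt group.

#include <stdint.h>
#include <stdio.h>

#define vtr_to_max_lr_idx(v)	((v) & 0xf)
#define vtr_to_nr_pre_bits(v)	((((uint32_t)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v)	(1 << (vtr_to_nr_pre_bits(v) - 5))

/* 8 minus the number of preemption bits, as in __vgic_v3_bpr_min(). */
static int demo_bpr_min(uint64_t vtr)
{
	return 8 - (int)vtr_to_nr_pre_bits(vtr);
}

/*
 * Drop the sub-priority bits below the BPR, as __vgic_v3_pri_to_pre() does
 * once the effective BPR for the interrupt's group is known.
 */
static unsigned int demo_pri_to_pre(unsigned int pri, unsigned int bpr)
{
	return pri & (0xffu << bpr);
}

int main(void)
{
	/* Hypothetical ICH_VTR_EL2: 16 list registers, 5 preemption bits. */
	uint64_t vtr = (4ULL << 26) | 15;

	printf("max LR index : %u\n", (unsigned int)vtr_to_max_lr_idx(vtr));
	printf("pre bits     : %u\n", (unsigned int)vtr_to_nr_pre_bits(vtr));
	printf("APR registers: %d\n", vtr_to_nr_apr_regs(vtr));
	printf("minimum BPR  : %d\n", demo_bpr_min(vtr));
	printf("pri 0x2d, BPR 3 -> preemption group 0x%02x\n", demo_pri_to_pre(0x2d, 3));
	return 0;
}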
// SPDX-License-Identifier: GPL-2.0+ /* * Driver for AMBA serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright 1999 ARM Limited * Copyright (C) 2000 Deep Blue Solutions Ltd. * Copyright (C) 2010 ST-Ericsson SA * * This is a generic driver for ARM AMBA-type serial ports. They * have a lot of 16550-like features, but are not register compatible. * Note that although they do have CTS, DCD and DSR inputs, they do * not have an RI input, nor do they have DTR or RTS outputs. If * required, these have to be supplied via some other means (eg, GPIO) * and hooked into this driver. */ #include <linux/module.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/platform_device.h> #include <linux/sysrq.h> #include <linux/device.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/serial_core.h> #include <linux/serial.h> #include <linux/amba/bus.h> #include <linux/amba/serial.h> #include <linux/clk.h> #include <linux/slab.h> #include <linux/dmaengine.h> #include <linux/dma-mapping.h> #include <linux/scatterlist.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> #include <linux/sizes.h> #include <linux/io.h> #include <linux/acpi.h> #define UART_NR 14 #define SERIAL_AMBA_MAJOR 204 #define SERIAL_AMBA_MINOR 64 #define SERIAL_AMBA_NR UART_NR #define AMBA_ISR_PASS_LIMIT 256 #define UART_DR_ERROR (UART011_DR_OE | UART011_DR_BE | UART011_DR_PE | UART011_DR_FE) #define UART_DUMMY_DR_RX BIT(16) enum { REG_DR, REG_ST_DMAWM, REG_ST_TIMEOUT, REG_FR, REG_LCRH_RX, REG_LCRH_TX, REG_IBRD, REG_FBRD, REG_CR, REG_IFLS, REG_IMSC, REG_RIS, REG_MIS, REG_ICR, REG_DMACR, REG_ST_XFCR, REG_ST_XON1, REG_ST_XON2, REG_ST_XOFF1, REG_ST_XOFF2, REG_ST_ITCR, REG_ST_ITIP, REG_ST_ABCR, REG_ST_ABIMSC, /* The size of the array - must be last */ REG_ARRAY_SIZE, }; static u16 pl011_std_offsets[REG_ARRAY_SIZE] = { [REG_DR] = UART01x_DR, [REG_FR] = UART01x_FR, [REG_LCRH_RX] = UART011_LCRH, [REG_LCRH_TX] = UART011_LCRH, [REG_IBRD] = UART011_IBRD, [REG_FBRD] = UART011_FBRD, [REG_CR] = UART011_CR, [REG_IFLS] = UART011_IFLS, [REG_IMSC] = UART011_IMSC, [REG_RIS] = UART011_RIS, [REG_MIS] = UART011_MIS, [REG_ICR] = UART011_ICR, [REG_DMACR] = UART011_DMACR, }; /* There is by now at least one vendor with differing details, so handle it */ struct vendor_data { const u16 *reg_offset; unsigned int ifls; unsigned int fr_busy; unsigned int fr_dsr; unsigned int fr_cts; unsigned int fr_ri; unsigned int inv_fr; bool access_32b; bool oversampling; bool dma_threshold; bool cts_event_workaround; bool always_enabled; bool fixed_options; unsigned int (*get_fifosize)(struct amba_device *dev); }; static unsigned int get_fifosize_arm(struct amba_device *dev) { return amba_rev(dev) < 3 ?
16 : 32; } static struct vendor_data vendor_arm = { .reg_offset = pl011_std_offsets, .ifls = UART011_IFLS_RX4_8 | UART011_IFLS_TX4_8, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = false, .fixed_options = false, .get_fifosize = get_fifosize_arm, }; static const struct vendor_data vendor_sbsa = { .reg_offset = pl011_std_offsets, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .access_32b = true, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = true, .fixed_options = true, }; #ifdef CONFIG_ACPI_SPCR_TABLE static const struct vendor_data vendor_qdt_qdf2400_e44 = { .reg_offset = pl011_std_offsets, .fr_busy = UART011_FR_TXFE, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .inv_fr = UART011_FR_TXFE, .access_32b = true, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = true, .fixed_options = true, }; #endif static u16 pl011_st_offsets[REG_ARRAY_SIZE] = { [REG_DR] = UART01x_DR, [REG_ST_DMAWM] = ST_UART011_DMAWM, [REG_ST_TIMEOUT] = ST_UART011_TIMEOUT, [REG_FR] = UART01x_FR, [REG_LCRH_RX] = ST_UART011_LCRH_RX, [REG_LCRH_TX] = ST_UART011_LCRH_TX, [REG_IBRD] = UART011_IBRD, [REG_FBRD] = UART011_FBRD, [REG_CR] = UART011_CR, [REG_IFLS] = UART011_IFLS, [REG_IMSC] = UART011_IMSC, [REG_RIS] = UART011_RIS, [REG_MIS] = UART011_MIS, [REG_ICR] = UART011_ICR, [REG_DMACR] = UART011_DMACR, [REG_ST_XFCR] = ST_UART011_XFCR, [REG_ST_XON1] = ST_UART011_XON1, [REG_ST_XON2] = ST_UART011_XON2, [REG_ST_XOFF1] = ST_UART011_XOFF1, [REG_ST_XOFF2] = ST_UART011_XOFF2, [REG_ST_ITCR] = ST_UART011_ITCR, [REG_ST_ITIP] = ST_UART011_ITIP, [REG_ST_ABCR] = ST_UART011_ABCR, [REG_ST_ABIMSC] = ST_UART011_ABIMSC, }; static unsigned int get_fifosize_st(struct amba_device *dev) { return 64; } static struct vendor_data vendor_st = { .reg_offset = pl011_st_offsets, .ifls = UART011_IFLS_RX_HALF | UART011_IFLS_TX_HALF, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .oversampling = true, .dma_threshold = true, .cts_event_workaround = true, .always_enabled = false, .fixed_options = false, .get_fifosize = get_fifosize_st, }; /* Deals with DMA transactions */ struct pl011_dmabuf { dma_addr_t dma; size_t len; char *buf; }; struct pl011_dmarx_data { struct dma_chan *chan; struct completion complete; bool use_buf_b; struct pl011_dmabuf dbuf_a; struct pl011_dmabuf dbuf_b; dma_cookie_t cookie; bool running; struct timer_list timer; unsigned int last_residue; unsigned long last_jiffies; bool auto_poll_rate; unsigned int poll_rate; unsigned int poll_timeout; }; struct pl011_dmatx_data { struct dma_chan *chan; dma_addr_t dma; size_t len; char *buf; bool queued; }; enum pl011_rs485_tx_state { OFF, WAIT_AFTER_RTS, SEND, WAIT_AFTER_SEND, }; /* * We wrap our port structure around the generic uart_port. 
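 *
 * Every uart_ops callback that receives the generic struct uart_port *
 * recovers the wrapper with container_of(), e.g.:
 *
 *	struct uart_amba_port *uap =
 *		container_of(port, struct uart_amba_port, port);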
*/ struct uart_amba_port { struct uart_port port; const u16 *reg_offset; struct clk *clk; const struct vendor_data *vendor; unsigned int im; /* interrupt mask */ unsigned int old_status; unsigned int fifosize; /* vendor-specific */ unsigned int fixed_baud; /* vendor-set fixed baud rate */ char type[12]; ktime_t rs485_tx_drain_interval; /* nano */ enum pl011_rs485_tx_state rs485_tx_state; struct hrtimer trigger_start_tx; struct hrtimer trigger_stop_tx; #ifdef CONFIG_DMA_ENGINE /* DMA stuff */ unsigned int dmacr; /* dma control reg */ bool using_tx_dma; bool using_rx_dma; struct pl011_dmarx_data dmarx; struct pl011_dmatx_data dmatx; bool dma_probed; #endif }; static unsigned int pl011_tx_empty(struct uart_port *port); static unsigned int pl011_reg_to_offset(const struct uart_amba_port *uap, unsigned int reg) { return uap->reg_offset[reg]; } static unsigned int pl011_read(const struct uart_amba_port *uap, unsigned int reg) { void __iomem *addr = uap->port.membase + pl011_reg_to_offset(uap, reg); return (uap->port.iotype == UPIO_MEM32) ? readl_relaxed(addr) : readw_relaxed(addr); } static void pl011_write(unsigned int val, const struct uart_amba_port *uap, unsigned int reg) { void __iomem *addr = uap->port.membase + pl011_reg_to_offset(uap, reg); if (uap->port.iotype == UPIO_MEM32) writel_relaxed(val, addr); else writew_relaxed(val, addr); } /* * Reads up to 256 characters from the FIFO or until it's empty and * inserts them into the TTY layer. Returns the number of characters * read from the FIFO. */ static int pl011_fifo_to_tty(struct uart_amba_port *uap) { unsigned int ch, fifotaken; int sysrq; u16 status; u8 flag; for (fifotaken = 0; fifotaken != 256; fifotaken++) { status = pl011_read(uap, REG_FR); if (status & UART01x_FR_RXFE) break; /* Take chars from the FIFO and update status */ ch = pl011_read(uap, REG_DR) | UART_DUMMY_DR_RX; flag = TTY_NORMAL; uap->port.icount.rx++; if (unlikely(ch & UART_DR_ERROR)) { if (ch & UART011_DR_BE) { ch &= ~(UART011_DR_FE | UART011_DR_PE); uap->port.icount.brk++; if (uart_handle_break(&uap->port)) continue; } else if (ch & UART011_DR_PE) { uap->port.icount.parity++; } else if (ch & UART011_DR_FE) { uap->port.icount.frame++; } if (ch & UART011_DR_OE) uap->port.icount.overrun++; ch &= uap->port.read_status_mask; if (ch & UART011_DR_BE) flag = TTY_BREAK; else if (ch & UART011_DR_PE) flag = TTY_PARITY; else if (ch & UART011_DR_FE) flag = TTY_FRAME; } sysrq = uart_prepare_sysrq_char(&uap->port, ch & 255); if (!sysrq) uart_insert_char(&uap->port, ch, UART011_DR_OE, ch, flag); } return fifotaken; } /* * All the DMA operation mode stuff goes inside this ifdef. * This assumes that you have a generic DMA device interface, * no custom DMA interfaces are supported. 
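 *
 * Concretely, the code below sticks to the generic dmaengine API: the
 * channels come from dma_request_chan() (or the legacy filter based
 * dma_request_channel()), are configured with dmaengine_slave_config(),
 * and every transfer is queued with dmaengine_prep_slave_single() and
 * dmaengine_submit() before being kicked via issue_pending().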
*/ #ifdef CONFIG_DMA_ENGINE #define PL011_DMA_BUFFER_SIZE PAGE_SIZE static int pl011_dmabuf_init(struct dma_chan *chan, struct pl011_dmabuf *db, enum dma_data_direction dir) { db->buf = dma_alloc_coherent(chan->device->dev, PL011_DMA_BUFFER_SIZE, &db->dma, GFP_KERNEL); if (!db->buf) return -ENOMEM; db->len = PL011_DMA_BUFFER_SIZE; return 0; } static void pl011_dmabuf_free(struct dma_chan *chan, struct pl011_dmabuf *db, enum dma_data_direction dir) { if (db->buf) { dma_free_coherent(chan->device->dev, PL011_DMA_BUFFER_SIZE, db->buf, db->dma); } } static void pl011_dma_probe(struct uart_amba_port *uap) { /* DMA is the sole user of the platform data right now */ struct amba_pl011_data *plat = dev_get_platdata(uap->port.dev); struct device *dev = uap->port.dev; struct dma_slave_config tx_conf = { .dst_addr = uap->port.mapbase + pl011_reg_to_offset(uap, REG_DR), .dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, .direction = DMA_MEM_TO_DEV, .dst_maxburst = uap->fifosize >> 1, .device_fc = false, }; struct dma_chan *chan; dma_cap_mask_t mask; uap->dma_probed = true; chan = dma_request_chan(dev, "tx"); if (IS_ERR(chan)) { if (PTR_ERR(chan) == -EPROBE_DEFER) { uap->dma_probed = false; return; } /* We need platform data */ if (!plat || !plat->dma_filter) { dev_dbg(uap->port.dev, "no DMA platform data\n"); return; } /* Try to acquire a generic DMA engine slave TX channel */ dma_cap_zero(mask); dma_cap_set(DMA_SLAVE, mask); chan = dma_request_channel(mask, plat->dma_filter, plat->dma_tx_param); if (!chan) { dev_err(uap->port.dev, "no TX DMA channel!\n"); return; } } dmaengine_slave_config(chan, &tx_conf); uap->dmatx.chan = chan; dev_info(uap->port.dev, "DMA channel TX %s\n", dma_chan_name(uap->dmatx.chan)); /* Optionally make use of an RX channel as well */ chan = dma_request_chan(dev, "rx"); if (IS_ERR(chan) && plat && plat->dma_rx_param) { chan = dma_request_channel(mask, plat->dma_filter, plat->dma_rx_param); if (!chan) { dev_err(uap->port.dev, "no RX DMA channel!\n"); return; } } if (!IS_ERR(chan)) { struct dma_slave_config rx_conf = { .src_addr = uap->port.mapbase + pl011_reg_to_offset(uap, REG_DR), .src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, .direction = DMA_DEV_TO_MEM, .src_maxburst = uap->fifosize >> 2, .device_fc = false, }; struct dma_slave_caps caps; /* * Some DMA controllers provide information on their capabilities. * If the controller does, check for suitable residue processing * otherwise assime all is well. */ if (dma_get_slave_caps(chan, &caps) == 0) { if (caps.residue_granularity == DMA_RESIDUE_GRANULARITY_DESCRIPTOR) { dma_release_channel(chan); dev_info(uap->port.dev, "RX DMA disabled - no residue processing\n"); return; } } dmaengine_slave_config(chan, &rx_conf); uap->dmarx.chan = chan; uap->dmarx.auto_poll_rate = false; if (plat && plat->dma_rx_poll_enable) { /* Set poll rate if specified. */ if (plat->dma_rx_poll_rate) { uap->dmarx.auto_poll_rate = false; uap->dmarx.poll_rate = plat->dma_rx_poll_rate; } else { /* * 100 ms defaults to poll rate if not * specified. This will be adjusted with * the baud rate at set_termios. */ uap->dmarx.auto_poll_rate = true; uap->dmarx.poll_rate = 100; } /* 3 secs defaults poll_timeout if not specified. 
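 *
 * Without platform data the same polling parameters may instead come
 * from the device tree (parsed just below), for example:
 *
 *	auto-poll;
 *	poll-rate-ms = <100>;
 *	poll-timeout-ms = <3000>;
 *
 * The values shown are simply the defaults used when the optional
 * properties are absent.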
*/ if (plat->dma_rx_poll_timeout) uap->dmarx.poll_timeout = plat->dma_rx_poll_timeout; else uap->dmarx.poll_timeout = 3000; } else if (!plat && dev->of_node) { uap->dmarx.auto_poll_rate = of_property_read_bool(dev->of_node, "auto-poll"); if (uap->dmarx.auto_poll_rate) { u32 x; if (of_property_read_u32(dev->of_node, "poll-rate-ms", &x) == 0) uap->dmarx.poll_rate = x; else uap->dmarx.poll_rate = 100; if (of_property_read_u32(dev->of_node, "poll-timeout-ms", &x) == 0) uap->dmarx.poll_timeout = x; else uap->dmarx.poll_timeout = 3000; } } dev_info(uap->port.dev, "DMA channel RX %s\n", dma_chan_name(uap->dmarx.chan)); } } static void pl011_dma_remove(struct uart_amba_port *uap) { if (uap->dmatx.chan) dma_release_channel(uap->dmatx.chan); if (uap->dmarx.chan) dma_release_channel(uap->dmarx.chan); } /* Forward declare these for the refill routine */ static int pl011_dma_tx_refill(struct uart_amba_port *uap); static void pl011_start_tx_pio(struct uart_amba_port *uap); /* * The current DMA TX buffer has been sent. * Try to queue up another DMA buffer. */ static void pl011_dma_tx_callback(void *data) { struct uart_amba_port *uap = data; struct tty_port *tport = &uap->port.state->port; struct pl011_dmatx_data *dmatx = &uap->dmatx; unsigned long flags; u16 dmacr; uart_port_lock_irqsave(&uap->port, &flags); if (uap->dmatx.queued) dma_unmap_single(dmatx->chan->device->dev, dmatx->dma, dmatx->len, DMA_TO_DEVICE); dmacr = uap->dmacr; uap->dmacr = dmacr & ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); /* * If TX DMA was disabled, it means that we've stopped the DMA for * some reason (eg, XOFF received, or we want to send an X-char.) * * Note: we need to be careful here of a potential race between DMA * and the rest of the driver - if the driver disables TX DMA while * a TX buffer completing, we must update the tx queued status to * get further refills (hence we check dmacr). */ if (!(dmacr & UART011_TXDMAE) || uart_tx_stopped(&uap->port) || kfifo_is_empty(&tport->xmit_fifo)) { uap->dmatx.queued = false; uart_port_unlock_irqrestore(&uap->port, flags); return; } if (pl011_dma_tx_refill(uap) <= 0) /* * We didn't queue a DMA buffer for some reason, but we * have data pending to be sent. Re-enable the TX IRQ. */ pl011_start_tx_pio(uap); uart_port_unlock_irqrestore(&uap->port, flags); } /* * Try to refill the TX DMA buffer. * Locking: called with port lock held and IRQs disabled. * Returns: * 1 if we queued up a TX DMA buffer. * 0 if we didn't want to handle this by DMA * <0 on error */ static int pl011_dma_tx_refill(struct uart_amba_port *uap) { struct pl011_dmatx_data *dmatx = &uap->dmatx; struct dma_chan *chan = dmatx->chan; struct dma_device *dma_dev = chan->device; struct dma_async_tx_descriptor *desc; struct tty_port *tport = &uap->port.state->port; unsigned int count; /* * Try to avoid the overhead involved in using DMA if the * transaction fits in the first half of the FIFO, by using * the standard interrupt handling. This ensures that we * issue a uart_write_wakeup() at the appropriate time. */ count = kfifo_len(&tport->xmit_fifo); if (count < (uap->fifosize >> 1)) { uap->dmatx.queued = false; return 0; } /* * Bodge: don't send the last character by DMA, as this * will prevent XON from notifying us to restart DMA. 
*/ count -= 1; /* Else proceed to copy the TX chars to the DMA buffer and fire DMA */ if (count > PL011_DMA_BUFFER_SIZE) count = PL011_DMA_BUFFER_SIZE; count = kfifo_out_peek(&tport->xmit_fifo, dmatx->buf, count); dmatx->len = count; dmatx->dma = dma_map_single(dma_dev->dev, dmatx->buf, count, DMA_TO_DEVICE); if (dmatx->dma == DMA_MAPPING_ERROR) { uap->dmatx.queued = false; dev_dbg(uap->port.dev, "unable to map TX DMA\n"); return -EBUSY; } desc = dmaengine_prep_slave_single(chan, dmatx->dma, dmatx->len, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dma_unmap_single(dma_dev->dev, dmatx->dma, dmatx->len, DMA_TO_DEVICE); uap->dmatx.queued = false; /* * If DMA cannot be used right now, we complete this * transaction via IRQ and let the TTY layer retry. */ dev_dbg(uap->port.dev, "TX DMA busy\n"); return -EBUSY; } /* Some data to go along to the callback */ desc->callback = pl011_dma_tx_callback; desc->callback_param = uap; /* All errors should happen at prepare time */ dmaengine_submit(desc); /* Fire the DMA transaction */ dma_dev->device_issue_pending(chan); uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmatx.queued = true; /* * Now we know that DMA will fire, so advance the ring buffer * with the stuff we just dispatched. */ uart_xmit_advance(&uap->port, count); if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(&uap->port); return 1; } /* * We received a transmit interrupt without a pending X-char but with * pending characters. * Locking: called with port lock held and IRQs disabled. * Returns: * false if we want to use PIO to transmit * true if we queued a DMA buffer */ static bool pl011_dma_tx_irq(struct uart_amba_port *uap) { if (!uap->using_tx_dma) return false; /* * If we already have a TX buffer queued, but received a * TX interrupt, it will be because we've just sent an X-char. * Ensure the TX DMA is enabled and the TX IRQ is disabled. */ if (uap->dmatx.queued) { uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); return true; } /* * We don't have a TX buffer queued, so try to queue one. * If we successfully queued a buffer, mask the TX IRQ. */ if (pl011_dma_tx_refill(uap) > 0) { uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); return true; } return false; } /* * Stop the DMA transmit (eg, due to received XOFF). * Locking: called with port lock held and IRQs disabled. */ static inline void pl011_dma_tx_stop(struct uart_amba_port *uap) { if (uap->dmatx.queued) { uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } } /* * Try to start a DMA transmit, or in the case of an XON/OFF * character queued for send, try to get that character out ASAP. * Locking: called with port lock held and IRQs disabled. * Returns: * false if we want the TX IRQ to be enabled * true if we have a buffer queued */ static inline bool pl011_dma_tx_start(struct uart_amba_port *uap) { u16 dmacr; if (!uap->using_tx_dma) return false; if (!uap->port.x_char) { /* no X-char, try to push chars out in DMA mode */ bool ret = true; if (!uap->dmatx.queued) { if (pl011_dma_tx_refill(uap) > 0) { uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); } else { ret = false; } } else if (!(uap->dmacr & UART011_TXDMAE)) { uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } return ret; } /* * We have an X-char to send. Disable DMA to prevent it loading * the TX fifo, and then see if we can stuff it into the FIFO. 
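 *
 * (If the FIFO is full we return false below so the TX interrupt gets
 * enabled instead; otherwise the X-char goes straight into REG_DR and
 * the saved DMA state is restored.)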
*/ dmacr = uap->dmacr; uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); if (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) { /* * No space in the FIFO, so enable the transmit interrupt * so we know when there is space. Note that once we've * loaded the character, we should just re-enable DMA. */ return false; } pl011_write(uap->port.x_char, uap, REG_DR); uap->port.icount.tx++; uap->port.x_char = 0; /* Success - restore the DMA state */ uap->dmacr = dmacr; pl011_write(dmacr, uap, REG_DMACR); return true; } /* * Flush the transmit buffer. * Locking: called with port lock held and IRQs disabled. */ static void pl011_dma_flush_buffer(struct uart_port *port) __releases(&uap->port.lock) __acquires(&uap->port.lock) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (!uap->using_tx_dma) return; dmaengine_terminate_async(uap->dmatx.chan); if (uap->dmatx.queued) { dma_unmap_single(uap->dmatx.chan->device->dev, uap->dmatx.dma, uap->dmatx.len, DMA_TO_DEVICE); uap->dmatx.queued = false; uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } } static void pl011_dma_rx_callback(void *data); static int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap) { struct dma_chan *rxchan = uap->dmarx.chan; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_async_tx_descriptor *desc; struct pl011_dmabuf *dbuf; if (!rxchan) return -EIO; /* Start the RX DMA job */ dbuf = uap->dmarx.use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; desc = dmaengine_prep_slave_single(rxchan, dbuf->dma, dbuf->len, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); /* * If the DMA engine is busy and cannot prepare a * channel, no big deal, the driver will fall back * to interrupt mode as a result of this error code. */ if (!desc) { uap->dmarx.running = false; dmaengine_terminate_all(rxchan); return -EBUSY; } /* Some data to go along to the callback */ desc->callback = pl011_dma_rx_callback; desc->callback_param = uap; dmarx->cookie = dmaengine_submit(desc); dma_async_issue_pending(rxchan); uap->dmacr |= UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmarx.running = true; uap->im &= ~UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); return 0; } /* * This is called when either the DMA job is complete, or * the FIFO timeout interrupt occurred. This must be called * with the port spinlock uap->port.lock held. */ static void pl011_dma_rx_chars(struct uart_amba_port *uap, u32 pending, bool use_buf_b, bool readfifo) { struct tty_port *port = &uap->port.state->port; struct pl011_dmabuf *dbuf = use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; int dma_count = 0; u32 fifotaken = 0; /* only used for vdbg() */ struct pl011_dmarx_data *dmarx = &uap->dmarx; int dmataken = 0; if (uap->dmarx.poll_rate) { /* The data can be taken by polling */ dmataken = dbuf->len - dmarx->last_residue; /* Recalculate the pending size */ if (pending >= dmataken) pending -= dmataken; } /* Pick the remain data from the DMA */ if (pending) { /* * First take all chars in the DMA pipe, then look in the FIFO. * Note that tty_insert_flip_buf() tries to take as many chars * as it can. 
*/ dma_count = tty_insert_flip_string(port, dbuf->buf + dmataken, pending); uap->port.icount.rx += dma_count; if (dma_count < pending) dev_warn(uap->port.dev, "couldn't insert all characters (TTY is full?)\n"); } /* Reset the last_residue for Rx DMA poll */ if (uap->dmarx.poll_rate) dmarx->last_residue = dbuf->len; /* * Only continue with trying to read the FIFO if all DMA chars have * been taken first. */ if (dma_count == pending && readfifo) { /* Clear any error flags */ pl011_write(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS, uap, REG_ICR); /* * If we read all the DMA'd characters, and we had an * incomplete buffer, that could be due to an rx error, or * maybe we just timed out. Read any pending chars and check * the error status. * * Error conditions will only occur in the FIFO, these will * trigger an immediate interrupt and stop the DMA job, so we * will always find the error in the FIFO, never in the DMA * buffer. */ fifotaken = pl011_fifo_to_tty(uap); } dev_vdbg(uap->port.dev, "Took %d chars from DMA buffer and %d chars from the FIFO\n", dma_count, fifotaken); tty_flip_buffer_push(port); } static void pl011_dma_rx_irq(struct uart_amba_port *uap) { struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = dmarx->chan; struct pl011_dmabuf *dbuf = dmarx->use_buf_b ? &dmarx->dbuf_b : &dmarx->dbuf_a; size_t pending; struct dma_tx_state state; enum dma_status dmastat; /* * Pause the transfer so we can trust the current counter, * do this before we pause the PL011 block, else we may * overflow the FIFO. */ if (dmaengine_pause(rxchan)) dev_err(uap->port.dev, "unable to pause DMA transfer\n"); dmastat = rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); if (dmastat != DMA_PAUSED) dev_err(uap->port.dev, "unable to pause DMA transfer\n"); /* Disable RX DMA - incoming data will wait in the FIFO */ uap->dmacr &= ~UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmarx.running = false; pending = dbuf->len - state.residue; BUG_ON(pending > PL011_DMA_BUFFER_SIZE); /* Then we terminate the transfer - we now know our residue */ dmaengine_terminate_all(rxchan); /* * This will take the chars we have so far and insert * into the framework. */ pl011_dma_rx_chars(uap, pending, dmarx->use_buf_b, true); /* Switch buffer & re-trigger DMA job */ dmarx->use_buf_b = !dmarx->use_buf_b; if (pl011_dma_rx_trigger_dma(uap)) { dev_dbg(uap->port.dev, "could not retrigger RX DMA job fall back to interrupt mode\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } } static void pl011_dma_rx_callback(void *data) { struct uart_amba_port *uap = data; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = dmarx->chan; bool lastbuf = dmarx->use_buf_b; struct pl011_dmabuf *dbuf = dmarx->use_buf_b ? &dmarx->dbuf_b : &dmarx->dbuf_a; size_t pending; struct dma_tx_state state; int ret; /* * This completion interrupt occurs typically when the * RX buffer is totally stuffed but no timeout has yet * occurred. When that happens, we just want the RX * routine to flush out the secondary DMA buffer while * we immediately trigger the next DMA job. */ uart_port_lock_irq(&uap->port); /* * Rx data can be taken by the UART interrupts during * the DMA irq handler. So we check the residue here. 
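 *
 * The amount of valid data is the buffer length minus the residue that
 * device_tx_status() reports, e.g. with the usual 4 KiB PAGE_SIZE
 * buffer a residue of 1024 means 4096 - 1024 = 3072 bytes are ready to
 * be pushed to the TTY.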
*/ rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); pending = dbuf->len - state.residue; BUG_ON(pending > PL011_DMA_BUFFER_SIZE); /* Then we terminate the transfer - we now know our residue */ dmaengine_terminate_all(rxchan); uap->dmarx.running = false; dmarx->use_buf_b = !lastbuf; ret = pl011_dma_rx_trigger_dma(uap); pl011_dma_rx_chars(uap, pending, lastbuf, false); uart_unlock_and_check_sysrq(&uap->port); /* * Do this check after we picked the DMA chars so we don't * get some IRQ immediately from RX. */ if (ret) { dev_dbg(uap->port.dev, "could not retrigger RX DMA job fall back to interrupt mode\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } } /* * Stop accepting received characters, when we're shutting down or * suspending this port. * Locking: called with port lock held and IRQs disabled. */ static inline void pl011_dma_rx_stop(struct uart_amba_port *uap) { if (!uap->using_rx_dma) return; /* FIXME. Just disable the DMA enable */ uap->dmacr &= ~UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } /* * Timer handler for Rx DMA polling. * Every polling, It checks the residue in the dma buffer and transfer * data to the tty. Also, last_residue is updated for the next polling. */ static void pl011_dma_rx_poll(struct timer_list *t) { struct uart_amba_port *uap = from_timer(uap, t, dmarx.timer); struct tty_port *port = &uap->port.state->port; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = uap->dmarx.chan; unsigned long flags; unsigned int dmataken = 0; unsigned int size = 0; struct pl011_dmabuf *dbuf; int dma_count; struct dma_tx_state state; dbuf = dmarx->use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); if (likely(state.residue < dmarx->last_residue)) { dmataken = dbuf->len - dmarx->last_residue; size = dmarx->last_residue - state.residue; dma_count = tty_insert_flip_string(port, dbuf->buf + dmataken, size); if (dma_count == size) dmarx->last_residue = state.residue; dmarx->last_jiffies = jiffies; } tty_flip_buffer_push(port); /* * If no data is received in poll_timeout, the driver will fall back * to interrupt mode. We will retrigger DMA at the first interrupt. 
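 *
 * With the defaults set up in pl011_dma_probe() (poll_rate = 100 ms,
 * poll_timeout = 3000 ms) this amounts to roughly 30 consecutive idle
 * polls before the timer is dropped and the RX interrupt is unmasked
 * again.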
*/ if (jiffies_to_msecs(jiffies - dmarx->last_jiffies) > uap->dmarx.poll_timeout) { uart_port_lock_irqsave(&uap->port, &flags); pl011_dma_rx_stop(uap); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); uart_port_unlock_irqrestore(&uap->port, flags); uap->dmarx.running = false; dmaengine_terminate_all(rxchan); del_timer(&uap->dmarx.timer); } else { mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); } } static void pl011_dma_startup(struct uart_amba_port *uap) { int ret; if (!uap->dma_probed) pl011_dma_probe(uap); if (!uap->dmatx.chan) return; uap->dmatx.buf = kmalloc(PL011_DMA_BUFFER_SIZE, GFP_KERNEL | __GFP_DMA); if (!uap->dmatx.buf) { uap->port.fifosize = uap->fifosize; return; } uap->dmatx.len = PL011_DMA_BUFFER_SIZE; /* The DMA buffer is now the FIFO the TTY subsystem can use */ uap->port.fifosize = PL011_DMA_BUFFER_SIZE; uap->using_tx_dma = true; if (!uap->dmarx.chan) goto skip_rx; /* Allocate and map DMA RX buffers */ ret = pl011_dmabuf_init(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); if (ret) { dev_err(uap->port.dev, "failed to init DMA %s: %d\n", "RX buffer A", ret); goto skip_rx; } ret = pl011_dmabuf_init(uap->dmarx.chan, &uap->dmarx.dbuf_b, DMA_FROM_DEVICE); if (ret) { dev_err(uap->port.dev, "failed to init DMA %s: %d\n", "RX buffer B", ret); pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); goto skip_rx; } uap->using_rx_dma = true; skip_rx: /* Turn on DMA error (RX/TX will be enabled on demand) */ uap->dmacr |= UART011_DMAONERR; pl011_write(uap->dmacr, uap, REG_DMACR); /* * ST Micro variants has some specific dma burst threshold * compensation. Set this to 16 bytes, so burst will only * be issued above/below 16 bytes. */ if (uap->vendor->dma_threshold) pl011_write(ST_UART011_DMAWM_RX_16 | ST_UART011_DMAWM_TX_16, uap, REG_ST_DMAWM); if (uap->using_rx_dma) { if (pl011_dma_rx_trigger_dma(uap)) dev_dbg(uap->port.dev, "could not trigger initial RX DMA job, fall back to interrupt mode\n"); if (uap->dmarx.poll_rate) { timer_setup(&uap->dmarx.timer, pl011_dma_rx_poll, 0); mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; uap->dmarx.last_jiffies = jiffies; } } } static void pl011_dma_shutdown(struct uart_amba_port *uap) { if (!(uap->using_tx_dma || uap->using_rx_dma)) return; /* Disable RX and TX DMA */ while (pl011_read(uap, REG_FR) & uap->vendor->fr_busy) cpu_relax(); uart_port_lock_irq(&uap->port); uap->dmacr &= ~(UART011_DMAONERR | UART011_RXDMAE | UART011_TXDMAE); pl011_write(uap->dmacr, uap, REG_DMACR); uart_port_unlock_irq(&uap->port); if (uap->using_tx_dma) { /* In theory, this should already be done by pl011_dma_flush_buffer */ dmaengine_terminate_all(uap->dmatx.chan); if (uap->dmatx.queued) { dma_unmap_single(uap->dmatx.chan->device->dev, uap->dmatx.dma, uap->dmatx.len, DMA_TO_DEVICE); uap->dmatx.queued = false; } kfree(uap->dmatx.buf); uap->using_tx_dma = false; } if (uap->using_rx_dma) { dmaengine_terminate_all(uap->dmarx.chan); /* Clean up the RX DMA */ pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_b, DMA_FROM_DEVICE); if (uap->dmarx.poll_rate) del_timer_sync(&uap->dmarx.timer); uap->using_rx_dma = false; } } static inline bool pl011_dma_rx_available(struct uart_amba_port *uap) { return uap->using_rx_dma; } static inline bool pl011_dma_rx_running(struct uart_amba_port *uap) { return uap->using_rx_dma && uap->dmarx.running; } #else /* Blank 
functions if the DMA engine is not available */ static inline void pl011_dma_remove(struct uart_amba_port *uap) { } static inline void pl011_dma_startup(struct uart_amba_port *uap) { } static inline void pl011_dma_shutdown(struct uart_amba_port *uap) { } static inline bool pl011_dma_tx_irq(struct uart_amba_port *uap) { return false; } static inline void pl011_dma_tx_stop(struct uart_amba_port *uap) { } static inline bool pl011_dma_tx_start(struct uart_amba_port *uap) { return false; } static inline void pl011_dma_rx_irq(struct uart_amba_port *uap) { } static inline void pl011_dma_rx_stop(struct uart_amba_port *uap) { } static inline int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap) { return -EIO; } static inline bool pl011_dma_rx_available(struct uart_amba_port *uap) { return false; } static inline bool pl011_dma_rx_running(struct uart_amba_port *uap) { return false; } #define pl011_dma_flush_buffer NULL #endif static void pl011_rs485_tx_stop(struct uart_amba_port *uap) { struct uart_port *port = &uap->port; u32 cr; if (uap->rs485_tx_state == SEND) uap->rs485_tx_state = WAIT_AFTER_SEND; if (uap->rs485_tx_state == WAIT_AFTER_SEND) { /* Schedule hrtimer if tx queue not empty */ if (!pl011_tx_empty(port)) { hrtimer_start(&uap->trigger_stop_tx, uap->rs485_tx_drain_interval, HRTIMER_MODE_REL); return; } if (port->rs485.delay_rts_after_send > 0) { hrtimer_start(&uap->trigger_stop_tx, ms_to_ktime(port->rs485.delay_rts_after_send), HRTIMER_MODE_REL); return; } /* Continue without any delay */ } else if (uap->rs485_tx_state == WAIT_AFTER_RTS) { hrtimer_try_to_cancel(&uap->trigger_start_tx); } cr = pl011_read(uap, REG_CR); if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) cr &= ~UART011_CR_RTS; else cr |= UART011_CR_RTS; /* Disable the transmitter and reenable the transceiver */ cr &= ~UART011_CR_TXE; cr |= UART011_CR_RXE; pl011_write(cr, uap, REG_CR); uap->rs485_tx_state = OFF; } static void pl011_stop_tx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state == WAIT_AFTER_RTS) { pl011_rs485_tx_stop(uap); return; } uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); pl011_dma_tx_stop(uap); if (port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state != OFF) pl011_rs485_tx_stop(uap); } static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq); /* Start TX with programmed I/O only (no DMA) */ static void pl011_start_tx_pio(struct uart_amba_port *uap) { if (pl011_tx_chars(uap, false)) { uap->im |= UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); } } static void pl011_rs485_tx_start(struct uart_amba_port *uap) { struct uart_port *port = &uap->port; u32 cr; if (uap->rs485_tx_state == WAIT_AFTER_RTS) { uap->rs485_tx_state = SEND; return; } if (uap->rs485_tx_state == WAIT_AFTER_SEND) { hrtimer_try_to_cancel(&uap->trigger_stop_tx); uap->rs485_tx_state = SEND; return; } /* uap->rs485_tx_state == OFF */ /* Enable transmitter */ cr = pl011_read(uap, REG_CR); cr |= UART011_CR_TXE; /* Disable receiver if half-duplex */ if (!(port->rs485.flags & SER_RS485_RX_DURING_TX)) cr &= ~UART011_CR_RXE; if (port->rs485.flags & SER_RS485_RTS_ON_SEND) cr &= ~UART011_CR_RTS; else cr |= UART011_CR_RTS; pl011_write(cr, uap, REG_CR); if (port->rs485.delay_rts_before_send > 0) { uap->rs485_tx_state = WAIT_AFTER_RTS; hrtimer_start(&uap->trigger_start_tx, ms_to_ktime(port->rs485.delay_rts_before_send), HRTIMER_MODE_REL); } else { uap->rs485_tx_state = SEND; } } static void 
pl011_start_tx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if ((uap->port.rs485.flags & SER_RS485_ENABLED) && uap->rs485_tx_state != SEND) { pl011_rs485_tx_start(uap); if (uap->rs485_tx_state == WAIT_AFTER_RTS) return; } if (!pl011_dma_tx_start(uap)) pl011_start_tx_pio(uap); } static enum hrtimer_restart pl011_trigger_start_tx(struct hrtimer *t) { struct uart_amba_port *uap = container_of(t, struct uart_amba_port, trigger_start_tx); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); if (uap->rs485_tx_state == WAIT_AFTER_RTS) pl011_start_tx(&uap->port); uart_port_unlock_irqrestore(&uap->port, flags); return HRTIMER_NORESTART; } static enum hrtimer_restart pl011_trigger_stop_tx(struct hrtimer *t) { struct uart_amba_port *uap = container_of(t, struct uart_amba_port, trigger_stop_tx); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); if (uap->rs485_tx_state == WAIT_AFTER_SEND) pl011_rs485_tx_stop(uap); uart_port_unlock_irqrestore(&uap->port, flags); return HRTIMER_NORESTART; } static void pl011_stop_rx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); uap->im &= ~(UART011_RXIM | UART011_RTIM | UART011_FEIM | UART011_PEIM | UART011_BEIM | UART011_OEIM); pl011_write(uap->im, uap, REG_IMSC); pl011_dma_rx_stop(uap); } static void pl011_throttle_rx(struct uart_port *port) { unsigned long flags; uart_port_lock_irqsave(port, &flags); pl011_stop_rx(port); uart_port_unlock_irqrestore(port, flags); } static void pl011_enable_ms(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); uap->im |= UART011_RIMIM | UART011_CTSMIM | UART011_DCDMIM | UART011_DSRMIM; pl011_write(uap->im, uap, REG_IMSC); } static void pl011_rx_chars(struct uart_amba_port *uap) __releases(&uap->port.lock) __acquires(&uap->port.lock) { pl011_fifo_to_tty(uap); uart_port_unlock(&uap->port); tty_flip_buffer_push(&uap->port.state->port); /* * If we were temporarily out of DMA mode for a while, * attempt to switch back to DMA mode again. */ if (pl011_dma_rx_available(uap)) { if (pl011_dma_rx_trigger_dma(uap)) { dev_dbg(uap->port.dev, "could not trigger RX DMA job fall back to interrupt mode again\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } else { #ifdef CONFIG_DMA_ENGINE /* Start Rx DMA poll */ if (uap->dmarx.poll_rate) { uap->dmarx.last_jiffies = jiffies; uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); } #endif } } uart_port_lock(&uap->port); } static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c, bool from_irq) { if (unlikely(!from_irq) && pl011_read(uap, REG_FR) & UART01x_FR_TXFF) return false; /* unable to transmit character */ pl011_write(c, uap, REG_DR); uap->port.icount.tx++; return true; } /* Returns true if tx interrupts have to be (kept) enabled */ static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq) { struct tty_port *tport = &uap->port.state->port; int count = uap->fifosize >> 1; if (uap->port.x_char) { if (!pl011_tx_char(uap, uap->port.x_char, from_irq)) return true; uap->port.x_char = 0; --count; } if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(&uap->port)) { pl011_stop_tx(&uap->port); return false; } /* If we are using DMA mode, try to send some characters. 
*/ if (pl011_dma_tx_irq(uap)) return true; while (1) { unsigned char c; if (likely(from_irq) && count-- == 0) break; if (!kfifo_peek(&tport->xmit_fifo, &c)) break; if (!pl011_tx_char(uap, c, from_irq)) break; kfifo_skip(&tport->xmit_fifo); } if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(&uap->port); if (kfifo_is_empty(&tport->xmit_fifo)) { pl011_stop_tx(&uap->port); return false; } return true; } static void pl011_modem_status(struct uart_amba_port *uap) { unsigned int status, delta; status = pl011_read(uap, REG_FR) & UART01x_FR_MODEM_ANY; delta = status ^ uap->old_status; uap->old_status = status; if (!delta) return; if (delta & UART01x_FR_DCD) uart_handle_dcd_change(&uap->port, status & UART01x_FR_DCD); if (delta & uap->vendor->fr_dsr) uap->port.icount.dsr++; if (delta & uap->vendor->fr_cts) uart_handle_cts_change(&uap->port, status & uap->vendor->fr_cts); wake_up_interruptible(&uap->port.state->port.delta_msr_wait); } static void check_apply_cts_event_workaround(struct uart_amba_port *uap) { if (!uap->vendor->cts_event_workaround) return; /* workaround to make sure that all bits are unlocked.. */ pl011_write(0x00, uap, REG_ICR); /* * WA: introduce 26ns(1 uart clk) delay before W1C; * single apb access will incur 2 pclk(133.12Mhz) delay, * so add 2 dummy reads */ pl011_read(uap, REG_ICR); pl011_read(uap, REG_ICR); } static irqreturn_t pl011_int(int irq, void *dev_id) { struct uart_amba_port *uap = dev_id; unsigned int status, pass_counter = AMBA_ISR_PASS_LIMIT; int handled = 0; uart_port_lock(&uap->port); status = pl011_read(uap, REG_RIS) & uap->im; if (status) { do { check_apply_cts_event_workaround(uap); pl011_write(status & ~(UART011_TXIS | UART011_RTIS | UART011_RXIS), uap, REG_ICR); if (status & (UART011_RTIS | UART011_RXIS)) { if (pl011_dma_rx_running(uap)) pl011_dma_rx_irq(uap); else pl011_rx_chars(uap); } if (status & (UART011_DSRMIS | UART011_DCDMIS | UART011_CTSMIS | UART011_RIMIS)) pl011_modem_status(uap); if (status & UART011_TXIS) pl011_tx_chars(uap, true); if (pass_counter-- == 0) break; status = pl011_read(uap, REG_RIS) & uap->im; } while (status != 0); handled = 1; } uart_unlock_and_check_sysrq(&uap->port); return IRQ_RETVAL(handled); } static unsigned int pl011_tx_empty(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); /* Allow feature register bits to be inverted to work around errata */ unsigned int status = pl011_read(uap, REG_FR) ^ uap->vendor->inv_fr; return status & (uap->vendor->fr_busy | UART01x_FR_TXFF) ? 
0 : TIOCSER_TEMT; } static void pl011_maybe_set_bit(bool cond, unsigned int *ptr, unsigned int mask) { if (cond) *ptr |= mask; } static unsigned int pl011_get_mctrl(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int result = 0; unsigned int status = pl011_read(uap, REG_FR); pl011_maybe_set_bit(status & UART01x_FR_DCD, &result, TIOCM_CAR); pl011_maybe_set_bit(status & uap->vendor->fr_dsr, &result, TIOCM_DSR); pl011_maybe_set_bit(status & uap->vendor->fr_cts, &result, TIOCM_CTS); pl011_maybe_set_bit(status & uap->vendor->fr_ri, &result, TIOCM_RNG); return result; } static void pl011_assign_bit(bool cond, unsigned int *ptr, unsigned int mask) { if (cond) *ptr |= mask; else *ptr &= ~mask; } static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int cr; cr = pl011_read(uap, REG_CR); pl011_assign_bit(mctrl & TIOCM_RTS, &cr, UART011_CR_RTS); pl011_assign_bit(mctrl & TIOCM_DTR, &cr, UART011_CR_DTR); pl011_assign_bit(mctrl & TIOCM_OUT1, &cr, UART011_CR_OUT1); pl011_assign_bit(mctrl & TIOCM_OUT2, &cr, UART011_CR_OUT2); pl011_assign_bit(mctrl & TIOCM_LOOP, &cr, UART011_CR_LBE); if (port->status & UPSTAT_AUTORTS) { /* We need to disable auto-RTS if we want to turn RTS off */ pl011_assign_bit(mctrl & TIOCM_RTS, &cr, UART011_CR_RTSEN); } pl011_write(cr, uap, REG_CR); } static void pl011_break_ctl(struct uart_port *port, int break_state) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; unsigned int lcr_h; uart_port_lock_irqsave(&uap->port, &flags); lcr_h = pl011_read(uap, REG_LCRH_TX); if (break_state == -1) lcr_h |= UART01x_LCRH_BRK; else lcr_h &= ~UART01x_LCRH_BRK; pl011_write(lcr_h, uap, REG_LCRH_TX); uart_port_unlock_irqrestore(&uap->port, flags); } #ifdef CONFIG_CONSOLE_POLL static void pl011_quiesce_irqs(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_write(pl011_read(uap, REG_MIS), uap, REG_ICR); /* * There is no way to clear TXIM as this is "ready to transmit IRQ", so * we simply mask it. start_tx() will unmask it. * * Note we can race with start_tx(), and if the race happens, the * polling user might get another interrupt just after we clear it. * But it should be OK and can happen even w/o the race, e.g. * controller immediately got some new data and raised the IRQ. * * And whoever uses polling routines assumes that it manages the device * (including tx queue), so we're also fine with start_tx()'s caller * side. */ pl011_write(pl011_read(uap, REG_IMSC) & ~UART011_TXIM, uap, REG_IMSC); } static int pl011_get_poll_char(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int status; /* * The caller might need IRQs lowered, e.g. if used with KDB NMI * debugger. 
*/ pl011_quiesce_irqs(port); status = pl011_read(uap, REG_FR); if (status & UART01x_FR_RXFE) return NO_POLL_CHAR; return pl011_read(uap, REG_DR); } static void pl011_put_poll_char(struct uart_port *port, unsigned char ch) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); while (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) cpu_relax(); pl011_write(ch, uap, REG_DR); } #endif /* CONFIG_CONSOLE_POLL */ static int pl011_hwinit(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); int retval; /* Optionaly enable pins to be muxed in and configured */ pinctrl_pm_select_default_state(port->dev); /* * Try to enable the clock producer. */ retval = clk_prepare_enable(uap->clk); if (retval) return retval; uap->port.uartclk = clk_get_rate(uap->clk); /* Clear pending error and receive interrupts */ pl011_write(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS | UART011_RTIS | UART011_RXIS, uap, REG_ICR); /* * Save interrupts enable mask, and enable RX interrupts in case if * the interrupt is used for NMI entry. */ uap->im = pl011_read(uap, REG_IMSC); pl011_write(UART011_RTIM | UART011_RXIM, uap, REG_IMSC); if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->init) plat->init(); } return 0; } static bool pl011_split_lcrh(const struct uart_amba_port *uap) { return pl011_reg_to_offset(uap, REG_LCRH_RX) != pl011_reg_to_offset(uap, REG_LCRH_TX); } static void pl011_write_lcr_h(struct uart_amba_port *uap, unsigned int lcr_h) { pl011_write(lcr_h, uap, REG_LCRH_RX); if (pl011_split_lcrh(uap)) { int i; /* * Wait 10 PCLKs before writing LCRH_TX register, * to get this delay write read only register 10 times */ for (i = 0; i < 10; ++i) pl011_write(0xff, uap, REG_MIS); pl011_write(lcr_h, uap, REG_LCRH_TX); } } static int pl011_allocate_irq(struct uart_amba_port *uap) { pl011_write(uap->im, uap, REG_IMSC); return request_irq(uap->port.irq, pl011_int, IRQF_SHARED, "uart-pl011", uap); } /* * Enable interrupts, only timeouts when using DMA * if initial RX DMA job failed, start in interrupt mode * as well. */ static void pl011_enable_interrupts(struct uart_amba_port *uap) { unsigned long flags; unsigned int i; uart_port_lock_irqsave(&uap->port, &flags); /* Clear out any spuriously appearing RX interrupts */ pl011_write(UART011_RTIS | UART011_RXIS, uap, REG_ICR); /* * RXIS is asserted only when the RX FIFO transitions from below * to above the trigger threshold. If the RX FIFO is already * full to the threshold this can't happen and RXIS will now be * stuck off. 
Drain the RX FIFO explicitly to fix this: */ for (i = 0; i < uap->fifosize * 2; ++i) { if (pl011_read(uap, REG_FR) & UART01x_FR_RXFE) break; pl011_read(uap, REG_DR); } uap->im = UART011_RTIM; if (!pl011_dma_rx_running(uap)) uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); uart_port_unlock_irqrestore(&uap->port, flags); } static void pl011_unthrottle_rx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); uap->im = UART011_RTIM; if (!pl011_dma_rx_running(uap)) uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); #ifdef CONFIG_DMA_ENGINE if (uap->using_rx_dma) { uap->dmacr |= UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } #endif uart_port_unlock_irqrestore(&uap->port, flags); } static int pl011_startup(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int cr; int retval; retval = pl011_hwinit(port); if (retval) goto clk_dis; retval = pl011_allocate_irq(uap); if (retval) goto clk_dis; pl011_write(uap->vendor->ifls, uap, REG_IFLS); uart_port_lock_irq(&uap->port); cr = pl011_read(uap, REG_CR); cr &= UART011_CR_RTS | UART011_CR_DTR; cr |= UART01x_CR_UARTEN | UART011_CR_RXE; if (!(port->rs485.flags & SER_RS485_ENABLED)) cr |= UART011_CR_TXE; pl011_write(cr, uap, REG_CR); uart_port_unlock_irq(&uap->port); /* * initialise the old status of the modem signals */ uap->old_status = pl011_read(uap, REG_FR) & UART01x_FR_MODEM_ANY; /* Startup DMA */ pl011_dma_startup(uap); pl011_enable_interrupts(uap); return 0; clk_dis: clk_disable_unprepare(uap->clk); return retval; } static int sbsa_uart_startup(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); int retval; retval = pl011_hwinit(port); if (retval) return retval; retval = pl011_allocate_irq(uap); if (retval) return retval; /* The SBSA UART does not support any modem status lines. */ uap->old_status = 0; pl011_enable_interrupts(uap); return 0; } static void pl011_shutdown_channel(struct uart_amba_port *uap, unsigned int lcrh) { unsigned long val; val = pl011_read(uap, lcrh); val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN); pl011_write(val, uap, lcrh); } /* * disable the port. It should not disable RTS and DTR. * Also RTS and DTR state should be preserved to restore * it during startup(). 
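 *
 * (Hence the read-modify-write below keeps only UART011_CR_RTS and
 * UART011_CR_DTR from the old CR value and ORs UARTEN/TXE back in.)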
*/ static void pl011_disable_uart(struct uart_amba_port *uap) { unsigned int cr; uap->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS); uart_port_lock_irq(&uap->port); cr = pl011_read(uap, REG_CR); cr &= UART011_CR_RTS | UART011_CR_DTR; cr |= UART01x_CR_UARTEN | UART011_CR_TXE; pl011_write(cr, uap, REG_CR); uart_port_unlock_irq(&uap->port); /* * disable break condition and fifos */ pl011_shutdown_channel(uap, REG_LCRH_RX); if (pl011_split_lcrh(uap)) pl011_shutdown_channel(uap, REG_LCRH_TX); } static void pl011_disable_interrupts(struct uart_amba_port *uap) { uart_port_lock_irq(&uap->port); /* mask all interrupts and clear all pending ones */ uap->im = 0; pl011_write(uap->im, uap, REG_IMSC); pl011_write(0xffff, uap, REG_ICR); uart_port_unlock_irq(&uap->port); } static void pl011_shutdown(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_disable_interrupts(uap); pl011_dma_shutdown(uap); if ((port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state != OFF)) pl011_rs485_tx_stop(uap); free_irq(uap->port.irq, uap); pl011_disable_uart(uap); /* * Shut down the clock producer */ clk_disable_unprepare(uap->clk); /* Optionally let pins go into sleep states */ pinctrl_pm_select_sleep_state(port->dev); if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->exit) plat->exit(); } if (uap->port.ops->flush_buffer) uap->port.ops->flush_buffer(port); } static void sbsa_uart_shutdown(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_disable_interrupts(uap); free_irq(uap->port.irq, uap); if (uap->port.ops->flush_buffer) uap->port.ops->flush_buffer(port); } static void pl011_setup_status_masks(struct uart_port *port, struct ktermios *termios) { port->read_status_mask = UART011_DR_OE | 255; if (termios->c_iflag & INPCK) port->read_status_mask |= UART011_DR_FE | UART011_DR_PE; if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) port->read_status_mask |= UART011_DR_BE; /* * Characters to ignore */ port->ignore_status_mask = 0; if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART011_DR_FE | UART011_DR_PE; if (termios->c_iflag & IGNBRK) { port->ignore_status_mask |= UART011_DR_BE; /* * If we're ignoring parity and break indicators, * ignore overruns too (for real raw support). */ if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART011_DR_OE; } /* * Ignore all characters if CREAD is not set. */ if ((termios->c_cflag & CREAD) == 0) port->ignore_status_mask |= UART_DUMMY_DR_RX; } static void pl011_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int lcr_h, old_cr; unsigned long flags; unsigned int baud, quot, clkdiv; unsigned int bits; if (uap->vendor->oversampling) clkdiv = 8; else clkdiv = 16; /* * Ask the core to calculate the divisor for us. */ baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / clkdiv); #ifdef CONFIG_DMA_ENGINE /* * Adjust RX DMA polling rate with baud rate if not specified. 
*/ if (uap->dmarx.auto_poll_rate) uap->dmarx.poll_rate = DIV_ROUND_UP(10000000, baud); #endif if (baud > port->uartclk / 16) quot = DIV_ROUND_CLOSEST(port->uartclk * 8, baud); else quot = DIV_ROUND_CLOSEST(port->uartclk * 4, baud); switch (termios->c_cflag & CSIZE) { case CS5: lcr_h = UART01x_LCRH_WLEN_5; break; case CS6: lcr_h = UART01x_LCRH_WLEN_6; break; case CS7: lcr_h = UART01x_LCRH_WLEN_7; break; default: // CS8 lcr_h = UART01x_LCRH_WLEN_8; break; } if (termios->c_cflag & CSTOPB) lcr_h |= UART01x_LCRH_STP2; if (termios->c_cflag & PARENB) { lcr_h |= UART01x_LCRH_PEN; if (!(termios->c_cflag & PARODD)) lcr_h |= UART01x_LCRH_EPS; if (termios->c_cflag & CMSPAR) lcr_h |= UART011_LCRH_SPS; } if (uap->fifosize > 1) lcr_h |= UART01x_LCRH_FEN; bits = tty_get_frame_size(termios->c_cflag); uart_port_lock_irqsave(port, &flags); /* * Update the per-port timeout. */ uart_update_timeout(port, termios->c_cflag, baud); /* * Calculate the approximated time it takes to transmit one character * with the given baud rate. We use this as the poll interval when we * wait for the tx queue to empty. */ uap->rs485_tx_drain_interval = ns_to_ktime(DIV_ROUND_UP(bits * NSEC_PER_SEC, baud)); pl011_setup_status_masks(port, termios); if (UART_ENABLE_MS(port, termios->c_cflag)) pl011_enable_ms(port); if (port->rs485.flags & SER_RS485_ENABLED) termios->c_cflag &= ~CRTSCTS; old_cr = pl011_read(uap, REG_CR); if (termios->c_cflag & CRTSCTS) { if (old_cr & UART011_CR_RTS) old_cr |= UART011_CR_RTSEN; old_cr |= UART011_CR_CTSEN; port->status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; } else { old_cr &= ~(UART011_CR_CTSEN | UART011_CR_RTSEN); port->status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS); } if (uap->vendor->oversampling) { if (baud > port->uartclk / 16) old_cr |= ST_UART011_CR_OVSFACT; else old_cr &= ~ST_UART011_CR_OVSFACT; } /* * Workaround for the ST Micro oversampling variants to * increase the bitrate slightly, by lowering the divisor, * to avoid delayed sampling of start bit at high speeds, * else we see data corruption. */ if (uap->vendor->oversampling) { if (baud >= 3000000 && baud < 3250000 && quot > 1) quot -= 1; else if (baud > 3250000 && quot > 2) quot -= 2; } /* Set baud rate */ pl011_write(quot & 0x3f, uap, REG_FBRD); pl011_write(quot >> 6, uap, REG_IBRD); /* * ----------v----------v----------v----------v----- * NOTE: REG_LCRH_TX and REG_LCRH_RX MUST BE WRITTEN AFTER * REG_FBRD & REG_IBRD. * ----------^----------^----------^----------^----- */ pl011_write_lcr_h(uap, lcr_h); /* * Receive was disabled by pl011_disable_uart during shutdown. * Need to reenable receive if you need to use a tty_driver * returns from tty_find_polling_driver() after a port shutdown. */ old_cr |= UART011_CR_RXE; pl011_write(old_cr, uap, REG_CR); uart_port_unlock_irqrestore(port, flags); } static void sbsa_uart_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; tty_termios_encode_baud_rate(termios, uap->fixed_baud, uap->fixed_baud); /* The SBSA UART only supports 8n1 without hardware flow control. 
*/ termios->c_cflag &= ~(CSIZE | CSTOPB | PARENB | PARODD); termios->c_cflag &= ~(CMSPAR | CRTSCTS); termios->c_cflag |= CS8 | CLOCAL; uart_port_lock_irqsave(port, &flags); uart_update_timeout(port, CS8, uap->fixed_baud); pl011_setup_status_masks(port, termios); uart_port_unlock_irqrestore(port, flags); } static const char *pl011_type(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); return uap->port.type == PORT_AMBA ? uap->type : NULL; } /* * Configure/autoconfigure the port. */ static void pl011_config_port(struct uart_port *port, int flags) { if (flags & UART_CONFIG_TYPE) port->type = PORT_AMBA; } /* * verify the new serial_struct (for TIOCSSERIAL). */ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser) { int ret = 0; if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA) ret = -EINVAL; if (ser->irq < 0 || ser->irq >= irq_get_nr_irqs()) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; if (port->mapbase != (unsigned long)ser->iomem_base) ret = -EINVAL; return ret; } static int pl011_rs485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (port->rs485.flags & SER_RS485_ENABLED) pl011_rs485_tx_stop(uap); /* Make sure auto RTS is disabled */ if (rs485->flags & SER_RS485_ENABLED) { u32 cr = pl011_read(uap, REG_CR); cr &= ~UART011_CR_RTSEN; pl011_write(cr, uap, REG_CR); port->status &= ~UPSTAT_AUTORTS; } return 0; } static const struct uart_ops amba_pl011_pops = { .tx_empty = pl011_tx_empty, .set_mctrl = pl011_set_mctrl, .get_mctrl = pl011_get_mctrl, .stop_tx = pl011_stop_tx, .start_tx = pl011_start_tx, .stop_rx = pl011_stop_rx, .throttle = pl011_throttle_rx, .unthrottle = pl011_unthrottle_rx, .enable_ms = pl011_enable_ms, .break_ctl = pl011_break_ctl, .startup = pl011_startup, .shutdown = pl011_shutdown, .flush_buffer = pl011_dma_flush_buffer, .set_termios = pl011_set_termios, .type = pl011_type, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_init = pl011_hwinit, .poll_get_char = pl011_get_poll_char, .poll_put_char = pl011_put_poll_char, #endif }; static void sbsa_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) { } static unsigned int sbsa_uart_get_mctrl(struct uart_port *port) { return 0; } static const struct uart_ops sbsa_uart_pops = { .tx_empty = pl011_tx_empty, .set_mctrl = sbsa_uart_set_mctrl, .get_mctrl = sbsa_uart_get_mctrl, .stop_tx = pl011_stop_tx, .start_tx = pl011_start_tx, .stop_rx = pl011_stop_rx, .startup = sbsa_uart_startup, .shutdown = sbsa_uart_shutdown, .set_termios = sbsa_uart_set_termios, .type = pl011_type, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_init = pl011_hwinit, .poll_get_char = pl011_get_poll_char, .poll_put_char = pl011_put_poll_char, #endif }; static struct uart_amba_port *amba_ports[UART_NR]; #ifdef CONFIG_SERIAL_AMBA_PL011_CONSOLE static void pl011_console_putchar(struct uart_port *port, unsigned char ch) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); while (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) cpu_relax(); pl011_write(ch, uap, REG_DR); } static void pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; unsigned long flags; int locked = 1; clk_enable(uap->clk); if (oops_in_progress) 
locked = uart_port_trylock_irqsave(&uap->port, &flags); else uart_port_lock_irqsave(&uap->port, &flags); /* * First save the CR then disable the interrupts */ if (!uap->vendor->always_enabled) { old_cr = pl011_read(uap, REG_CR); new_cr = old_cr & ~UART011_CR_CTSEN; new_cr |= UART01x_CR_UARTEN | UART011_CR_TXE; pl011_write(new_cr, uap, REG_CR); } uart_console_write(&uap->port, s, count, pl011_console_putchar); /* * Finally, wait for transmitter to become empty and restore the * TCR. Allow feature register bits to be inverted to work around * errata. */ while ((pl011_read(uap, REG_FR) ^ uap->vendor->inv_fr) & uap->vendor->fr_busy) cpu_relax(); if (!uap->vendor->always_enabled) pl011_write(old_cr, uap, REG_CR); if (locked) uart_port_unlock_irqrestore(&uap->port, flags); clk_disable(uap->clk); } static void pl011_console_get_options(struct uart_amba_port *uap, int *baud, int *parity, int *bits) { unsigned int lcr_h, ibrd, fbrd; if (!(pl011_read(uap, REG_CR) & UART01x_CR_UARTEN)) return; lcr_h = pl011_read(uap, REG_LCRH_TX); *parity = 'n'; if (lcr_h & UART01x_LCRH_PEN) { if (lcr_h & UART01x_LCRH_EPS) *parity = 'e'; else *parity = 'o'; } if ((lcr_h & 0x60) == UART01x_LCRH_WLEN_7) *bits = 7; else *bits = 8; ibrd = pl011_read(uap, REG_IBRD); fbrd = pl011_read(uap, REG_FBRD); *baud = uap->port.uartclk * 4 / (64 * ibrd + fbrd); if (uap->vendor->oversampling && (pl011_read(uap, REG_CR) & ST_UART011_CR_OVSFACT)) *baud *= 2; } static int pl011_console_setup(struct console *co, char *options) { struct uart_amba_port *uap; int baud = 38400; int bits = 8; int parity = 'n'; int flow = 'n'; int ret; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index >= UART_NR) co->index = 0; uap = amba_ports[co->index]; if (!uap) return -ENODEV; /* Allow pins to be muxed in and configured */ pinctrl_pm_select_default_state(uap->port.dev); ret = clk_prepare(uap->clk); if (ret) return ret; if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->init) plat->init(); } uap->port.uartclk = clk_get_rate(uap->clk); if (uap->vendor->fixed_options) { baud = uap->fixed_baud; } else { if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else pl011_console_get_options(uap, &baud, &parity, &bits); } return uart_set_options(&uap->port, co, baud, parity, bits, flow); } /** * pl011_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=pl011,mmio|mmio32,<addr>[,<options>] * console=pl011,0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the amba_console at pl011 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int pl011_console_match(struct console *co, char *name, int idx, char *options) { unsigned char iotype; resource_size_t addr; int i; /* * Systems affected by the Qualcomm Technologies QDF2400 E44 erratum * have a distinct console name, so make sure we check for that. * The actual implementation of the erratum occurs in the probe * function. 
*/ if ((strcmp(name, "qdf2400_e44") != 0) && (strcmp(name, "pl011") != 0)) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; if (iotype != UPIO_MEM && iotype != UPIO_MEM32) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < ARRAY_SIZE(amba_ports); i++) { struct uart_port *port; if (!amba_ports[i]) continue; port = &amba_ports[i]->port; if (port->mapbase != addr) continue; co->index = i; uart_port_set_cons(port, co); return pl011_console_setup(co, options); } return -ENODEV; } static struct uart_driver amba_reg; static struct console amba_console = { .name = "ttyAMA", .write = pl011_console_write, .device = uart_console_device, .setup = pl011_console_setup, .match = pl011_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &amba_reg, }; #define AMBA_CONSOLE (&amba_console) static void qdf2400_e44_putc(struct uart_port *port, unsigned char c) { while (readl(port->membase + UART01x_FR) & UART01x_FR_TXFF) cpu_relax(); writel(c, port->membase + UART01x_DR); while (!(readl(port->membase + UART01x_FR) & UART011_FR_TXFE)) cpu_relax(); } static void qdf2400_e44_early_write(struct console *con, const char *s, unsigned int n) { struct earlycon_device *dev = con->data; uart_console_write(&dev->port, s, n, qdf2400_e44_putc); } static void pl011_putc(struct uart_port *port, unsigned char c) { while (readl(port->membase + UART01x_FR) & UART01x_FR_TXFF) cpu_relax(); if (port->iotype == UPIO_MEM32) writel(c, port->membase + UART01x_DR); else writeb(c, port->membase + UART01x_DR); while (readl(port->membase + UART01x_FR) & UART01x_FR_BUSY) cpu_relax(); } static void pl011_early_write(struct console *con, const char *s, unsigned int n) { struct earlycon_device *dev = con->data; uart_console_write(&dev->port, s, n, pl011_putc); } #ifdef CONFIG_CONSOLE_POLL static int pl011_getc(struct uart_port *port) { if (readl(port->membase + UART01x_FR) & UART01x_FR_RXFE) return NO_POLL_CHAR; if (port->iotype == UPIO_MEM32) return readl(port->membase + UART01x_DR); else return readb(port->membase + UART01x_DR); } static int pl011_early_read(struct console *con, char *s, unsigned int n) { struct earlycon_device *dev = con->data; int ch, num_read = 0; while (num_read < n) { ch = pl011_getc(&dev->port); if (ch == NO_POLL_CHAR) break; s[num_read++] = ch; } return num_read; } #else #define pl011_early_read NULL #endif /* * On non-ACPI systems, earlycon is enabled by specifying * "earlycon=pl011,<address>" on the kernel command line. * * On ACPI ARM64 systems, an "early" console is enabled via the SPCR table, * by specifying only "earlycon" on the command line. Because it requires * SPCR, the console starts after ACPI is parsed, which is later than a * traditional early console. * * To get the traditional early console that starts before ACPI is parsed, * specify the full "earlycon=pl011,<address>" option. */ static int __init pl011_early_console_setup(struct earlycon_device *device, const char *opt) { if (!device->port.membase) return -ENODEV; device->con->write = pl011_early_write; device->con->read = pl011_early_read; return 0; } OF_EARLYCON_DECLARE(pl011, "arm,pl011", pl011_early_console_setup); OF_EARLYCON_DECLARE(pl011, "arm,sbsa-uart", pl011_early_console_setup); /* * On Qualcomm Datacenter Technologies QDF2400 SOCs affected by * Erratum 44, traditional earlycon can be enabled by specifying * "earlycon=qdf2400_e44,<address>". Any options are ignored. 
* * Alternatively, you can just specify "earlycon", and the early console * will be enabled with the information from the SPCR table. In this * case, the SPCR code will detect the need for the E44 work-around, * and set the console name to "qdf2400_e44". */ static int __init qdf2400_e44_early_console_setup(struct earlycon_device *device, const char *opt) { if (!device->port.membase) return -ENODEV; device->con->write = qdf2400_e44_early_write; return 0; } EARLYCON_DECLARE(qdf2400_e44, qdf2400_e44_early_console_setup); #else #define AMBA_CONSOLE NULL #endif static struct uart_driver amba_reg = { .owner = THIS_MODULE, .driver_name = "ttyAMA", .dev_name = "ttyAMA", .major = SERIAL_AMBA_MAJOR, .minor = SERIAL_AMBA_MINOR, .nr = UART_NR, .cons = AMBA_CONSOLE, }; static int pl011_probe_dt_alias(int index, struct device *dev) { struct device_node *np; static bool seen_dev_with_alias; static bool seen_dev_without_alias; int ret = index; if (!IS_ENABLED(CONFIG_OF)) return ret; np = dev->of_node; if (!np) return ret; ret = of_alias_get_id(np, "serial"); if (ret < 0) { seen_dev_without_alias = true; ret = index; } else { seen_dev_with_alias = true; if (ret >= ARRAY_SIZE(amba_ports) || amba_ports[ret]) { dev_warn(dev, "requested serial port %d not available.\n", ret); ret = index; } } if (seen_dev_with_alias && seen_dev_without_alias) dev_warn(dev, "aliased and non-aliased serial devices found in device tree. Serial port enumeration may be unpredictable.\n"); return ret; } /* unregisters the driver also if no more ports are left */ static void pl011_unregister_port(struct uart_amba_port *uap) { int i; bool busy = false; for (i = 0; i < ARRAY_SIZE(amba_ports); i++) { if (amba_ports[i] == uap) amba_ports[i] = NULL; else if (amba_ports[i]) busy = true; } pl011_dma_remove(uap); if (!busy) uart_unregister_driver(&amba_reg); } static int pl011_find_free_port(void) { int i; for (i = 0; i < ARRAY_SIZE(amba_ports); i++) if (!amba_ports[i]) return i; return -EBUSY; } static int pl011_setup_port(struct device *dev, struct uart_amba_port *uap, struct resource *mmiobase, int index) { void __iomem *base; int ret; base = devm_ioremap_resource(dev, mmiobase); if (IS_ERR(base)) return PTR_ERR(base); index = pl011_probe_dt_alias(index, dev); uap->port.dev = dev; uap->port.mapbase = mmiobase->start; uap->port.membase = base; uap->port.fifosize = uap->fifosize; uap->port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_AMBA_PL011_CONSOLE); uap->port.flags = UPF_BOOT_AUTOCONF; uap->port.line = index; ret = uart_get_rs485_mode(&uap->port); if (ret) return ret; amba_ports[index] = uap; return 0; } static int pl011_register_port(struct uart_amba_port *uap) { int ret, i; /* Ensure interrupts from this UART are masked and cleared */ pl011_write(0, uap, REG_IMSC); pl011_write(0xffff, uap, REG_ICR); if (!amba_reg.state) { ret = uart_register_driver(&amba_reg); if (ret < 0) { dev_err(uap->port.dev, "Failed to register AMBA-PL011 driver\n"); for (i = 0; i < ARRAY_SIZE(amba_ports); i++) if (amba_ports[i] == uap) amba_ports[i] = NULL; return ret; } } ret = uart_add_one_port(&amba_reg, &uap->port); if (ret) pl011_unregister_port(uap); return ret; } static const struct serial_rs485 pl011_rs485_supported = { .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; static int pl011_probe(struct amba_device *dev, const struct amba_id *id) { struct uart_amba_port *uap; struct vendor_data *vendor = id->data; int portnr, ret; u32 val; portnr = 
pl011_find_free_port(); if (portnr < 0) return portnr; uap = devm_kzalloc(&dev->dev, sizeof(struct uart_amba_port), GFP_KERNEL); if (!uap) return -ENOMEM; uap->clk = devm_clk_get(&dev->dev, NULL); if (IS_ERR(uap->clk)) return PTR_ERR(uap->clk); uap->reg_offset = vendor->reg_offset; uap->vendor = vendor; uap->fifosize = vendor->get_fifosize(dev); uap->port.iotype = vendor->access_32b ? UPIO_MEM32 : UPIO_MEM; uap->port.irq = dev->irq[0]; uap->port.ops = &amba_pl011_pops; uap->port.rs485_config = pl011_rs485_config; uap->port.rs485_supported = pl011_rs485_supported; snprintf(uap->type, sizeof(uap->type), "PL011 rev%u", amba_rev(dev)); if (device_property_read_u32(&dev->dev, "reg-io-width", &val) == 0) { switch (val) { case 1: uap->port.iotype = UPIO_MEM; break; case 4: uap->port.iotype = UPIO_MEM32; break; default: dev_warn(&dev->dev, "unsupported reg-io-width (%d)\n", val); return -EINVAL; } } hrtimer_init(&uap->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&uap->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); uap->trigger_start_tx.function = pl011_trigger_start_tx; uap->trigger_stop_tx.function = pl011_trigger_stop_tx; ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr); if (ret) return ret; amba_set_drvdata(dev, uap); return pl011_register_port(uap); } static void pl011_remove(struct amba_device *dev) { struct uart_amba_port *uap = amba_get_drvdata(dev); uart_remove_one_port(&amba_reg, &uap->port); pl011_unregister_port(uap); } #ifdef CONFIG_PM_SLEEP static int pl011_suspend(struct device *dev) { struct uart_amba_port *uap = dev_get_drvdata(dev); if (!uap) return -EINVAL; return uart_suspend_port(&amba_reg, &uap->port); } static int pl011_resume(struct device *dev) { struct uart_amba_port *uap = dev_get_drvdata(dev); if (!uap) return -EINVAL; return uart_resume_port(&amba_reg, &uap->port); } #endif static SIMPLE_DEV_PM_OPS(pl011_dev_pm_ops, pl011_suspend, pl011_resume); #ifdef CONFIG_ACPI_SPCR_TABLE static void qpdf2400_erratum44_workaround(struct device *dev, struct uart_amba_port *uap) { if (!qdf2400_e44_present) return; dev_info(dev, "working around QDF2400 SoC erratum 44\n"); uap->vendor = &vendor_qdt_qdf2400_e44; } #else static void qpdf2400_erratum44_workaround(struct device *dev, struct uart_amba_port *uap) { /* empty */ } #endif static int sbsa_uart_probe(struct platform_device *pdev) { struct uart_amba_port *uap; struct resource *r; int portnr, ret; int baudrate; /* * Check the mandatory baud rate parameter in the DT node early * so that we can easily exit with the error. */ if (pdev->dev.of_node) { struct device_node *np = pdev->dev.of_node; ret = of_property_read_u32(np, "current-speed", &baudrate); if (ret) return ret; } else { baudrate = 115200; } portnr = pl011_find_free_port(); if (portnr < 0) return portnr; uap = devm_kzalloc(&pdev->dev, sizeof(struct uart_amba_port), GFP_KERNEL); if (!uap) return -ENOMEM; ret = platform_get_irq(pdev, 0); if (ret < 0) return ret; uap->port.irq = ret; uap->vendor = &vendor_sbsa; qpdf2400_erratum44_workaround(&pdev->dev, uap); uap->reg_offset = uap->vendor->reg_offset; uap->fifosize = 32; uap->port.iotype = uap->vendor->access_32b ? 
UPIO_MEM32 : UPIO_MEM; uap->port.ops = &sbsa_uart_pops; uap->fixed_baud = baudrate; snprintf(uap->type, sizeof(uap->type), "SBSA"); r = platform_get_resource(pdev, IORESOURCE_MEM, 0); ret = pl011_setup_port(&pdev->dev, uap, r, portnr); if (ret) return ret; platform_set_drvdata(pdev, uap); return pl011_register_port(uap); } static void sbsa_uart_remove(struct platform_device *pdev) { struct uart_amba_port *uap = platform_get_drvdata(pdev); uart_remove_one_port(&amba_reg, &uap->port); pl011_unregister_port(uap); } static const struct of_device_id sbsa_uart_of_match[] = { { .compatible = "arm,sbsa-uart", }, {}, }; MODULE_DEVICE_TABLE(of, sbsa_uart_of_match); static const struct acpi_device_id __maybe_unused sbsa_uart_acpi_match[] = { { "ARMH0011", 0 }, { "ARMHB000", 0 }, {}, }; MODULE_DEVICE_TABLE(acpi, sbsa_uart_acpi_match); static struct platform_driver arm_sbsa_uart_platform_driver = { .probe = sbsa_uart_probe, .remove = sbsa_uart_remove, .driver = { .name = "sbsa-uart", .pm = &pl011_dev_pm_ops, .of_match_table = of_match_ptr(sbsa_uart_of_match), .acpi_match_table = ACPI_PTR(sbsa_uart_acpi_match), .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, }; static const struct amba_id pl011_ids[] = { { .id = 0x00041011, .mask = 0x000fffff, .data = &vendor_arm, }, { .id = 0x00380802, .mask = 0x00ffffff, .data = &vendor_st, }, { 0, 0 }, }; MODULE_DEVICE_TABLE(amba, pl011_ids); static struct amba_driver pl011_driver = { .drv = { .name = "uart-pl011", .pm = &pl011_dev_pm_ops, .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, .id_table = pl011_ids, .probe = pl011_probe, .remove = pl011_remove, }; static int __init pl011_init(void) { pr_info("Serial: AMBA PL011 UART driver\n"); if (platform_driver_register(&arm_sbsa_uart_platform_driver)) pr_warn("could not register SBSA UART platform driver\n"); return amba_driver_register(&pl011_driver); } static void __exit pl011_exit(void) { platform_driver_unregister(&arm_sbsa_uart_platform_driver); amba_driver_unregister(&pl011_driver); } /* * While this can be a module, if builtin it's most likely the console * So let's leave module_exit but move module_init to an earlier place */ arch_initcall(pl011_init); module_exit(pl011_exit); MODULE_AUTHOR("ARM Ltd/Deep Blue Solutions Ltd"); MODULE_DESCRIPTION("ARM AMBA serial port driver"); MODULE_LICENSE("GPL");
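For reference, the UARTIBRD/UARTFBRD programming done in pl011_set_termios() above reduces to a 16.6 fixed-point divisor of the UART reference clock (the ST oversampling path simply doubles the scale factor). The following standalone sketch is illustrative only, with a hypothetical helper name and plain C outside the driver; it shows how the rounded quotient is split into the integer and fractional baud-rate registers:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical helper: divisor = uartclk / (16 * baud), kept in 16.6
 * fixed point, i.e. quot = uartclk * 4 / baud rounded to nearest,
 * as in pl011_set_termios() above. */
static void pl011_divisors(uint32_t uartclk, uint32_t baud,
			   uint32_t *ibrd, uint32_t *fbrd)
{
	uint32_t quot = (uartclk * 4u + baud / 2) / baud;

	*ibrd = quot >> 6;	/* integer part -> UARTIBRD */
	*fbrd = quot & 0x3f;	/* fractional part -> UARTFBRD */
}

int main(void)
{
	uint32_t ibrd, fbrd;

	pl011_divisors(24000000, 115200, &ibrd, &fbrd);	/* 24 MHz ref clock */
	printf("IBRD=%u FBRD=%u\n", ibrd, fbrd);	/* prints IBRD=13 FBRD=1 */
	return 0;
}

With a 24 MHz reference clock at 115200 baud the exact divisor is 13.02, so the split lands on IBRD=13 and FBRD=1, matching what the register write in the driver would program.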
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 

/* Copyright (c) 2018, Mellanox Technologies All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <crypto/aead.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/dst.h> #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> #include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ static DECLARE_RWSEM(device_offload_lock); static struct workqueue_struct *destruct_wq __read_mostly; static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); static struct page *dummy_page; static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) kfree(tls_offload_ctx_tx(ctx)); if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); tls_ctx_free(NULL, ctx); } static void tls_device_tx_del_task(struct work_struct *work) { struct tls_offload_context_tx *offload_ctx = container_of(work, struct tls_offload_context_tx, destruct_work); struct tls_context *ctx = offload_ctx->ctx; struct net_device *netdev; /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); ctx->netdev = NULL; tls_device_free_ctx(ctx); } static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { struct net_device *netdev; unsigned long flags; bool async_cleanup; spin_lock_irqsave(&tls_device_lock, flags); if (unlikely(!refcount_dec_and_test(&ctx->refcount))) { spin_unlock_irqrestore(&tls_device_lock, flags); return; } list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */ /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); async_cleanup = netdev && ctx->tx_conf == TLS_HW; if (async_cleanup) { struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx); /* queue_work inside the spinlock * to make sure tls_device_down waits for that work. 
*/ queue_work(destruct_wq, &offload_ctx->destruct_work); } spin_unlock_irqrestore(&tls_device_lock, flags); if (!async_cleanup) tls_device_free_ctx(ctx); } /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { struct dst_entry *dst = sk_dst_get(sk); struct net_device *netdev = NULL; if (likely(dst)) { netdev = netdev_sk_get_lowest_dev(dst->dev, sk); dev_hold(netdev); } dst_release(dst); return netdev; } static void destroy_record(struct tls_record_info *record) { int i; for (i = 0; i < record->num_frags; i++) __skb_frag_unref(&record->frags[i], false); kfree(record); } static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { list_del(&info->list); destroy_record(info); } offload_ctx->retransmit_hint = NULL; } static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; if (info && !before(acked_seq, info->end_seq)) ctx->retransmit_hint = NULL; list_for_each_entry_safe(info, temp, &ctx->records_list, list) { if (before(acked_seq, info->end_seq)) break; list_del(&info->list); destroy_record(info); deleted_records++; } ctx->unacked_record_sn += deleted_records; spin_unlock_irqrestore(&ctx->lock, flags); } /* At this point, there should be no references on this * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); tls_ctx->sk_destruct(sk); if (tls_ctx->tx_conf == TLS_HW) { if (ctx->open_record) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); clean_acked_data_disable(inet_csk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_free_partial_record(sk, tls_ctx); } void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); } EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { struct net_device *netdev; int err = 0; u8 *rcd_sn; tcp_write_collapse_fence(sk); rcd_sn = tls_ctx->tx.rec_seq; trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (netdev) err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_TX); up_read(&device_offload_lock); if (err) return; clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); } static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) { skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; if (skb_frag_page(frag) == pfrag->page && skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { skb_frag_size_add(frag, size); } 
else { ++frag; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, size); ++record->num_frags; get_page(pfrag->page); } pfrag->offset += size; record->len += size; } static int tls_push_record(struct sock *sk, struct tls_context *ctx, struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, int flags) { struct tls_prot_info *prot = &ctx->prot_info; struct tcp_sock *tp = tcp_sk(sk); skb_frag_t *frag; int i; record->end_seq = tp->write_seq + record->len; list_add_tail_rcu(&record->list, &offload_ctx->records_list); offload_ctx->open_record = NULL; if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) tls_device_resync_tx(sk, ctx, tp->write_seq); tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); /* all ready, send */ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } static void tls_device_record_close(struct sock *sk, struct tls_context *ctx, struct tls_record_info *record, struct page_frag *pfrag, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) * if we can't allocate memory now use the dummy page */ if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { dummy_tag_frag.page = dummy_page; dummy_tag_frag.offset = 0; pfrag = &dummy_tag_frag; } tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { struct tls_record_info *record; skb_frag_t *frag; record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) return -ENOMEM; frag = &record->frags[0]; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; record->num_frags = 1; record->len = prepend_size; offload_ctx->open_record = record; return 0; } static int tls_do_allocation(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { int ret; if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); if (ret) return ret; if (pfrag->size > pfrag->offset) return 0; } if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; return 0; } static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) { size_t pre_copy, nocache; pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1); if (pre_copy) { pre_copy = min(pre_copy, bytes); if (copy_from_iter(addr, pre_copy, i) != pre_copy) return -EFAULT; bytes -= pre_copy; addr += pre_copy; } nocache = round_down(bytes, SMP_CACHE_BYTES); if (copy_from_iter_nocache(addr, nocache, i) != nocache) return -EFAULT; bytes -= nocache; addr += nocache; if 
(bytes && copy_from_iter(addr, bytes, i) != bytes) return -EFAULT; return 0; } static int tls_push_data(struct sock *sk, struct iov_iter *iter, size_t size, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); struct tls_record_info *record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; bool more = false; bool done = false; int copy, rc = 0; long timeo; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) return -EINVAL; if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; tls_push_record_flags = flags | MSG_MORE; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); if (rc < 0) return rc; } pfrag = sk_page_frag(sk); /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; record = ctx->open_record; if (!record) break; handle_error: if (record_type != TLS_RECORD_TYPE_DATA) { /* avoid sending partial * record with type != * application_data */ size = orig_size; destroy_record(record); ctx->open_record = NULL; } else if (record->len > prot->prepend_size) { goto last_record; } break; } record = ctx->open_record; copy = min_t(size_t, size, max_open_record_len - record->len); if (copy && (flags & MSG_SPLICE_PAGES)) { struct page_frag zc_pfrag; struct page **pages = &zc_pfrag.page; size_t off; rc = iov_iter_extract_pages(iter, &pages, copy, 1, 0, &off); if (rc <= 0) { if (rc == 0) rc = -EIO; goto handle_error; } copy = rc; if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { iov_iter_revert(iter, copy); rc = -EIO; goto handle_error; } zc_pfrag.offset = off; zc_pfrag.size = copy; tls_append_frag(record, &zc_pfrag, copy); } else if (copy) { copy = min_t(size_t, copy, pfrag->size - pfrag->offset); rc = tls_device_copy_data(page_address(pfrag->page) + pfrag->offset, copy, iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); } size -= copy; if (!size) { last_record: tls_push_record_flags = flags; if (flags & MSG_MORE) { more = true; break; } done = true; } if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { tls_device_record_close(sk, tls_ctx, record, pfrag, record_type); rc = tls_push_record(sk, tls_ctx, ctx, record, tls_push_record_flags); if (rc < 0) break; } } while (!done); tls_ctx->pending_open_record_frags = more; if (orig_size - size > 0) rc = orig_size - size; return rc; } int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; if (!tls_ctx->zerocopy_sendfile) msg->msg_flags &= ~MSG_SPLICE_PAGES; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { rc = tls_process_cmsg(sk, msg, &record_type); if (rc) goto out; } rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags, record_type); out: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return rc; } void tls_device_splice_eof(struct socket 
*sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter iter = {}; if (!tls_is_partially_sent_record(tls_ctx)) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (tls_is_partially_sent_record(tls_ctx)) { iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0); tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA); } release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || before(seq, info->end_seq - info->len)) { /* if retransmit_hint is irrelevant start * from the beginning of the list */ info = list_first_entry_or_null(&context->records_list, struct tls_record_info, list); if (!info) return NULL; /* send the start_marker record if seq number is before the * tls offload start marker sequence number. This record is * required to handle TCP packets which are before TLS offload * started. * And if it's not start marker, look if this seq number * belongs to the list. */ if (likely(!tls_record_is_start_marker(info))) { /* we have the first record, get the last record to see * if this seq number belongs to the list. */ last = list_last_entry(&context->records_list, struct tls_record_info, list); if (!between(seq, tls_record_start_seq(info), last->end_seq)) return NULL; } record_sn = context->unacked_record_sn; } /* We just need the _rcu for the READ_ONCE() */ rcu_read_lock(); list_for_each_entry_from_rcu(info, &context->records_list, list) { if (before(seq, info->end_seq)) { if (!context->retransmit_hint || after(info->end_seq, context->retransmit_hint->end_seq)) { context->hint_record_sn = record_sn; context->retransmit_hint = info; } *p_record_sn = record_sn; goto exit_rcu_unlock; } record_sn++; } info = NULL; exit_rcu_unlock: rcu_read_unlock(); return info; } EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0); return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; WARN_ON_ONCE(sk->sk_write_pending); sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); rcu_read_lock(); netdev = rcu_dereference(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); rcu_read_unlock(); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); u16 i; *rcd_delta = 0; if (is_async) { /* shouldn't get to wraparound: * too long in async stage, something bad happened */ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) return false; /* asynchronous stage: log all headers seq such 
that * req_seq <= seq <= end_seq, and wait for real resync request */ if (before(*seq, req_seq)) return false; if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; resync_async->rcd_delta++; return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ for (i = 0; i < resync_async->loglen; i++) if (req_seq == resync_async->log[i] && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; resync_async->loglen = 0; resync_async->rcd_delta = 0; return true; } resync_async->loglen = 0; resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) return true; return false; } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) return; if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) return; prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); switch (rx_ctx->resync_type) { case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; is_req_pending = resync_req; if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: if (likely(!rx_ctx->resync_nh_do_now)) return; /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ sock_data = tcp_inq(sk); if (sock_data > rcd_len) { trace_tls_device_rx_resync_nh_delay(sk, sock_data, rcd_len); return; } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: resync_req = atomic64_read(&rx_ctx->resync_async->req); is_req_pending = resync_req; if (likely(!is_req_pending)) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, resync_req, &seq, &rcd_delta)) return; tls_bigint_subtract(rcd_sn, rcd_delta); break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); } static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, struct tls_offload_context_rx *ctx, struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm; /* device will request resyncs by itself based on stream scan */ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) return; /* already scheduled */ if (ctx->resync_nh_do_now) return; /* seen decrypted fragments since last fully-failed record */ if (ctx->resync_nh_reset) { ctx->resync_nh_reset = 0; ctx->resync_nh.decrypted_failed = 1; ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; return; } if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) return; /* doing resync, bump the next target in case it fails */ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) ctx->resync_nh.decrypted_tgt *= 2; else ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; rxm = strp_msg(skb); /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { 
trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); tls_bigint_increment(rcd_sn, prot->rec_seq_size); tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, rcd_sn); } } static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); DEBUG_NET_WARN_ON_ONCE(!cipher_desc || !cipher_desc->offloadable); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; err = tls_strp_msg_cow(sw_ctx); if (unlikely(err)) goto free_buf; skb = tls_strp_msg(sw_ctx); rxm = strp_msg(skb); offset = rxm->offset; sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, sg); if (err != -EBADMSG) goto free_buf; else err = 0; data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) { err = skb_store_bits(skb, offset, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; } pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { int frag_pos; /* Practically all frags must belong to msg if reencrypt * is needed with current strparser and coalescing logic, * but strparser may "get optimized", so let's be safe. */ if (pos + skb_iter->len <= offset) goto done_with_frag; if (pos >= data_len + rxm->offset) break; frag_pos = offset - pos; copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); if (skb_iter->decrypted) { err = skb_store_bits(skb_iter, frag_pos, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; done_with_frag: pos += skb_iter->len; } free_buf: kfree(orig_buf); return err; } int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); int is_decrypted, is_encrypted; if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { is_decrypted = skb->decrypted; is_encrypted = !is_decrypted; } else { is_decrypted = 0; is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, tls_ctx->rx.rec_seq, rxm->full_len, is_encrypted, is_decrypted); if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) { if (likely(is_encrypted || is_decrypted)) return is_decrypted; /* After tls_device_down disables the offload, the next SKB will * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. 
*/ if (is_decrypted) { ctx->resync_nh_reset = 1; return is_decrypted; } if (is_encrypted) { tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); return 0; } ctx->resync_nh_reset = 1; return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, struct net_device *netdev) { if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *ctx) { struct tls_offload_context_tx *offload_ctx; __be64 rcd_sn; offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); if (!offload_ctx) return NULL; INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task); INIT_LIST_HEAD(&offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); sg_init_table(offload_ctx->sg_tx_data, ARRAY_SIZE(offload_ctx->sg_tx_data)); /* start at rec_seq - 1 to account for the start marker record */ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; offload_ctx->ctx = ctx; return offload_ctx; } int tls_set_device_offload(struct sock *sk) { struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; const struct tls_cipher_desc *cipher_desc; struct tls_crypto_info *crypto_info; struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; char *iv, *rec_seq; int rc; ctx = tls_get_ctx(sk); prot = &ctx->prot_info; if (ctx->priv_ctx_tx) return -EEXIST; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { rc = -EOPNOTSUPP; goto release_netdev; } crypto_info = &ctx->crypto_send.info; if (crypto_info->version != TLS_1_2_VERSION) { rc = -EOPNOTSUPP; goto release_netdev; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } rc = init_prot_info(prot, crypto_info, cipher_desc); if (rc) goto release_netdev; iv = crypto_info_iv(crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq); start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); if (!start_marker_record) { rc = -ENOMEM; goto release_netdev; } offload_ctx = alloc_offload_ctx_tx(ctx); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; } rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) goto free_offload_ctx; start_marker_record->end_seq = tcp_sk(sk)->write_seq; start_marker_record->len = 0; start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. 
*/ tcp_write_collapse_fence(sk); /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); return 0; release_lock: up_read(&device_offload_lock); clean_acked_data_disable(inet_csk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); release_netdev: dev_put(netdev); return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. 
*/ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) { rc = -ENOMEM; goto release_lock; } context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); info = (void *)&ctx->crypto_recv.info; trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); dev_put(netdev); return 0; free_sw_resources: up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_lock: up_read(&device_offload_lock); release_netdev: dev_put(netdev); return rc; } void tls_device_offload_cleanup_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct net_device *netdev; down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); tls_sw_release_resources_rx(sk); } static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; unsigned long flags; LIST_HEAD(list); /* Request a write lock to block new offload attempts */ down_write(&device_offload_lock); spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { struct net_device *ctx_netdev = rcu_dereference_protected(ctx->netdev, lockdep_is_held(&device_offload_lock)); if (ctx_netdev != netdev || !refcount_inc_not_zero(&ctx->refcount)) continue; list_move(&ctx->list, &list); } spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); /* Sync with inflight packets. After this point: * TX: no non-encrypted packets will be passed to the driver. * RX: resync requests from the driver will be ignored. */ synchronize_net(); /* Release the offload context on the driver side. */ if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); if (ctx->rx_conf == TLS_HW && !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); dev_put(netdev); /* Move the context to a separate list for two reasons: * 1. When the context is deallocated, list_del is called. * 2. It's no longer an offloaded context, so we don't want to * run offload-specific code on this context. */ spin_lock_irqsave(&tls_device_lock, flags); list_move_tail(&ctx->list, &tls_device_down_list); spin_unlock_irqrestore(&tls_device_lock, flags); /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. 
rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ if (refcount_dec_and_test(&ctx->refcount)) { /* sk_destruct ran after tls_device_down took a ref, and * it returned early. Complete the destruction here. */ list_del(&ctx->list); tls_device_free_ctx(ctx); } } up_write(&device_offload_lock); flush_workqueue(destruct_wq); return NOTIFY_DONE; } static int tls_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!dev->tlsdev_ops && !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if (netif_is_bond_master(dev)) return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) return NOTIFY_DONE; else return NOTIFY_BAD; case NETDEV_DOWN: return tls_device_down(dev); } return NOTIFY_DONE; } static struct notifier_block tls_dev_notifier = { .notifier_call = tls_dev_event, }; int __init tls_device_init(void) { int err; dummy_page = alloc_page(GFP_KERNEL); if (!dummy_page) return -ENOMEM; destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; } err = register_netdevice_notifier(&tls_dev_notifier); if (err) goto err_destroy_wq; return 0; err_destroy_wq: destroy_workqueue(destruct_wq); err_free_dummy: put_page(dummy_page); return err; } void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); put_page(dummy_page); }
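The cache-line handling in tls_device_copy_data() above is easier to see in isolation: the copy is split into a head that runs up to the next cache-line boundary, a bulk of whole cache lines (done with the non-caching copy variant in the driver), and a short tail. A minimal userspace sketch of just that arithmetic, with hypothetical names and assuming a 64-byte cache line:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SMP_CACHE_BYTES 64	/* assumption: typical cache-line size */

/* Hypothetical helper mirroring the split in tls_device_copy_data(). */
static void split_copy(uintptr_t addr, size_t bytes,
		       size_t *pre, size_t *bulk, size_t *tail)
{
	*pre = (-addr) & (SMP_CACHE_BYTES - 1);	/* bytes to next boundary */
	if (*pre > bytes)
		*pre = bytes;
	bytes -= *pre;
	*bulk = bytes & ~(size_t)(SMP_CACHE_BYTES - 1);	/* whole cache lines */
	*tail = bytes - *bulk;
}

int main(void)
{
	size_t pre, bulk, tail;

	split_copy(0x1010, 300, &pre, &bulk, &tail);
	printf("pre=%zu bulk=%zu tail=%zu\n", pre, bulk, tail);	/* 48 192 60 */
	return 0;
}

Starting 16 bytes into a cache line, the first 48 bytes bring the destination to a line boundary, 192 bytes (three full lines) go through the non-temporal path, and the remaining 60 bytes are copied normally.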
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __KVM_ARM_PSCI_H__
#define __KVM_ARM_PSCI_H__

#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>

#define KVM_ARM_PSCI_0_1	PSCI_VERSION(0, 1)
#define KVM_ARM_PSCI_0_2	PSCI_VERSION(0, 2)
#define KVM_ARM_PSCI_1_0	PSCI_VERSION(1, 0)
#define KVM_ARM_PSCI_1_1	PSCI_VERSION(1, 1)
#define KVM_ARM_PSCI_1_2	PSCI_VERSION(1, 2)
#define KVM_ARM_PSCI_1_3	PSCI_VERSION(1, 3)

#define KVM_ARM_PSCI_LATEST	KVM_ARM_PSCI_1_3

static inline int kvm_psci_version(struct kvm_vcpu *vcpu)
{
	/*
	 * Our PSCI implementation stays the same across versions from
	 * v0.2 onward, only adding the few mandatory functions (such
	 * as FEATURES with 1.0) that are required by newer
	 * revisions. It is thus safe to return the latest, unless
	 * userspace has instructed us otherwise.
	 */
	if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) {
		if (vcpu->kvm->arch.psci_version)
			return vcpu->kvm->arch.psci_version;

		return KVM_ARM_PSCI_LATEST;
	}

	return KVM_ARM_PSCI_0_1;
}

int kvm_psci_call(struct kvm_vcpu *vcpu);

#endif /* __KVM_ARM_PSCI_H__ */
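/*
 * Illustrative sketch only: the KVM_ARM_PSCI_* values are built with
 * PSCI_VERSION(), which packs the major revision into the upper 16 bits and
 * the minor into the lower 16 (see uapi/linux/psci.h), so packed values
 * compare numerically in version order.  check_psci_1_0_feature() is a
 * made-up helper name, not part of the KVM code above.
 */
static inline bool check_psci_1_0_feature(struct kvm_vcpu *vcpu)
{
	int ver = kvm_psci_version(vcpu);

	/* Anything negotiated below v1.0 (including v0.1 guests) is out. */
	if (ver < KVM_ARM_PSCI_1_0)
		return false;

	return PSCI_VERSION_MAJOR(ver) >= 1;
}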
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fcntl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>
#include <linux/poll.h>

#include <asm/siginfo.h>
#include <linux/uaccess.h>

#include "internal.h"

#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)

static int setfl(int fd, struct file * filp, unsigned int arg)
{
	struct inode * inode = file_inode(filp);
	int error = 0;

	/*
	 * O_APPEND cannot be cleared if the file is marked as append-only
	 * and the file is open for write.
	 */
	if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
		return -EPERM;

	/* O_NOATIME can only be set by the owner or superuser */
	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
		if (!inode_owner_or_capable(file_mnt_idmap(filp), inode))
			return -EPERM;

	/* required for strict SunOS emulation */
	if (O_NONBLOCK != O_NDELAY)
		if (arg & O_NDELAY)
			arg |= O_NONBLOCK;

	/* Pipe packetized mode is controlled by O_DIRECT flag */
	if (!S_ISFIFO(inode->i_mode) &&
	    (arg & O_DIRECT) &&
	    !(filp->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;

	if (filp->f_op->check_flags)
		error = filp->f_op->check_flags(arg);
	if (error)
		return error;

	/*
	 * ->fasync() is responsible for setting the FASYNC bit.
	 */
	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
		if (error < 0)
			goto out;
		if (error > 0)
			error = 0;
	}
	spin_lock(&filp->f_lock);
	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
	filp->f_iocb_flags = iocb_flags(filp);
	spin_unlock(&filp->f_lock);

 out:
	return error;
}

/*
 * Allocate an file->f_owner struct if it doesn't exist, handling racing
 * allocations correctly.
*/ int file_f_owner_allocate(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) return 0; f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); if (!f_owner) return -ENOMEM; rwlock_init(&f_owner->lock); f_owner->file = file; /* If someone else raced us, drop our allocation. */ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) kfree(f_owner); return 0; } EXPORT_SYMBOL(file_f_owner_allocate); void file_f_owner_release(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) { put_pid(f_owner->pid); kfree(f_owner); } } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (WARN_ON_ONCE(!f_owner)) return; write_lock_irq(&f_owner->lock); if (force || !f_owner->pid) { put_pid(f_owner->pid); f_owner->pid = get_pid(pid); f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); security_file_set_fowner(filp); f_owner->uid = cred->uid; f_owner->euid = cred->euid; } } write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; int ret = 0; might_sleep(); type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) return -EINVAL; type = PIDTYPE_PGID; who = -who; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); if (who) { pid = find_vpid(who); if (!pid) ret = -ESRCH; } if (!ret) __f_setown(filp, pid, type, force); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (!f_owner) return pid; read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) { pid = pid_vnr(f_owner->pid); if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); read_unlock_irq(&f_owner->lock); return pid; } static int f_setown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; int ret; ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) return -EFAULT; switch (owner.type) { case F_OWNER_TID: type = PIDTYPE_PID; break; case F_OWNER_PID: type = PIDTYPE_TGID; break; case F_OWNER_PGRP: type = PIDTYPE_PGID; break; default: return -EINVAL; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) ret = -ESRCH; else __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; } static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; struct fown_struct *f_owner; enum pid_type pid_type = PIDTYPE_PID; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) owner.pid = pid_vnr(f_owner->pid); rcu_read_unlock(); pid_type = f_owner->pid_type; } switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; case PIDTYPE_PGID: owner.type = F_OWNER_PGRP; break; default: WARN_ON(1); ret = -EINVAL; break; } if (f_owner) read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); if (ret) ret = -EFAULT; 
} return ret; } #ifdef CONFIG_CHECKPOINT_RESTORE static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; uid_t src[2] = {0, 0}; int err; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); src[0] = from_kuid(user_ns, f_owner->uid); src[1] = from_kuid(user_ns, f_owner->euid); read_unlock_irq(&f_owner->lock); } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); return err; } #else static int f_getowner_uids(struct file *filp, unsigned long arg) { return -EINVAL; } #endif static bool rw_hint_valid(u64 hint) { BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: case RWH_WRITE_LIFE_LONG: case RWH_WRITE_LIFE_EXTREME: return true; default: return false; } } static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint = READ_ONCE(inode->i_write_hint); if (copy_to_user(argp, &hint, sizeof(*argp))) return -EFAULT; return 0; } static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) return -EINVAL; WRITE_ONCE(inode->i_write_hint, hint); /* * file->f_mapping->host may differ from inode. As an example, * blkdev_open() modifies file->f_mapping. */ if (file->f_mapping->host != inode) WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); return 0; } /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. * * Technically we didn't need a ref at all, and 'fdget()' was * overkill, but given our lockless file pointer lookup, the * alternatives are complicated. */ return fd_file(f) == filp; } /* Let the caller figure out whether a given file was just created. 
*/ static long f_created_query(const struct file *filp) { return !!(filp->f_mode & FMODE_CREATED); } static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; struct fown_struct *f_owner; might_sleep(); if (setsig) { if (!valid_signal(signum)) return -EINVAL; ret = file_f_owner_allocate(filp); if (ret) return ret; } f_owner = file_f_owner(filp); if (setsig) f_owner->signum = signum; else if (f_owner) ret = f_owner->signum; return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_CREATED_QUERY: err = f_created_query(filp); break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_DUPFD_QUERY: err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_GETLK: #endif case F_GETLK: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_getlk(filp, cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_SETLK: case F_OFD_SETLKW: fallthrough; #endif case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* * XXX If f_owner is a process group, the * negative return value will get converted * into an error. Oops. If we keep the * current syscall conventions, the only way * to fix this will be in libc. 
*/ err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); break; case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; case F_GETOWNER_UIDS: err = f_getowner_uids(filp, arg); break; case F_GETSIG: err = f_owner_sig(filp, 0, false); break; case F_SETSIG: err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: err = fcntl_get_rw_hint(filp, cmd, arg); break; case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; } return err; } static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: return 1; } return 0; } SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd_raw, f)(fd); long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); return err; } #if BITS_PER_LONG == 32 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; CLASS(fd_raw, f)(fd); struct flock64 flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK64: case F_OFD_GETLK: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } #endif #ifdef CONFIG_COMPAT /* careful - don't use anywhere else */ #define copy_flock_fields(dst, src) \ (dst)->l_type = (src)->l_type; \ (dst)->l_whence = (src)->l_whence; \ (dst)->l_start = (src)->l_start; \ (dst)->l_len = (src)->l_len; \ (dst)->l_pid = (src)->l_pid; static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) { struct compat_flock fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { struct compat_flock64 fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { struct compat_flock fl; memset(&fl, 0, sizeof(struct compat_flock)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { struct compat_flock64 fl; BUILD_BUG_ON(sizeof(kfl->l_start) > 
sizeof(ufl->l_start)); BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } #undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) { switch (cmd) { case F_GETLK64: return F_GETLK; case F_SETLK64: return F_SETLK; case F_SETLKW64: return F_SETLKW; } return cmd; } /* * GETLK was successful and we need to return the data, but it needs to fit in * the compat structure. * l_start shouldn't be too big, unless the original start + end is greater than * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return * -EOVERFLOW in that case. l_len could be too big, in which case we just * truncate it, and only allow the app to see that part of the conflicting lock * that might make sense to it anyway */ static int fixup_compat_flock(struct flock *flock) { if (flock->l_start > COMPAT_OFF_T_MAX) return -EOVERFLOW; if (flock->l_len > COMPAT_OFF_T_MAX) flock->l_len = COMPAT_OFF_T_MAX; return 0; } static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { CLASS(fd_raw, f)(fd); struct flock flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); if (!err) err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { return do_compat_fcntl64(fd, cmd, arg); } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { switch (cmd) { case F_GETLK64: case F_SETLK64: case F_SETLKW64: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: return -EINVAL; } return do_compat_fcntl64(fd, cmd, arg); } #endif /* Table to convert sigio signal codes into poll band bitmaps */ static const __poll_t band_table[NSIGPOLL] = { EPOLLIN | EPOLLRDNORM, /* POLL_IN */ EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */ EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */ EPOLLERR, /* POLL_ERR */ EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */ EPOLLHUP | EPOLLERR /* POLL_HUP */ }; static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { const struct cred *cred; int ret; rcu_read_lock(); cred = __task_cred(p); ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); 
rcu_read_unlock(); return ret; } static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; switch (signum) { default: { kernel_siginfo_t si; /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* * Posix definies POLL_IN and friends to be signal * specific si_codes for SIG_POLL. Linux extended * these si_codes to other signals in a way that is * ambiguous if other signals also have signal * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; } fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; unsigned long flags; struct pid *pid; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct file *file) { struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; fown = file_f_owner(file); if (!fown) return 0; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); return ret; } static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, * do nothing and return 0. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". 
* */ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } struct fasync_struct *fasync_alloc(void) { return kmem_cache_alloc(fasync_cache, GFP_KERNEL); } /* * NOTE! This can be used only for unused fasync entries: * entries that actually got inserted on the fasync list * need to be released by rcu - see fasync_remove_entry. */ void fasync_free(struct fasync_struct *new) { kmem_cache_free(fasync_cache, new); } /* * Insert a new entry into the fasync list. Return the pointer to the * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; write_unlock_irq(&fa->fa_lock); goto out; } rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return fa; } /* * Add a fasync entry. Return negative on error, positive if * added, and zero if did nothing but change an existing one. */ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *new; new = fasync_alloc(); if (!new) return -ENOMEM; /* * fasync_insert_entry() returns the old (update) entry if * it existed. * * So free the (unused) new entry and return 0 to let the * caller know that we didn't add any new fasync entries. */ if (fasync_insert_entry(fd, filp, fapp, new)) { fasync_free(new); return 0; } return 1; } /* * fasync_helper() is used by almost all character device drivers * to set up the fasync queue, and for regular files by the file * lease code. It returns negative on error, 0 if it did no changes * and positive if it added/deleted the entry. */ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) { if (!on) return fasync_remove_entry(filp, fapp); return fasync_add_entry(fd, filp, fapp); } EXPORT_SYMBOL(fasync_helper); /* * rcu_read_lock() is held */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = file_f_owner(fa->fa_file); if (!fown) goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { rcu_read_lock(); kill_fasync_rcu(rcu_dereference(*fp), sig, band); rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { /* * Please add new bits here to ensure allocation uniqueness. * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } module_init(fcntl_init)
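/*
 * Userspace sketch (not kernel code, illustrative only): the F_SETOWN,
 * F_SETSIG and FASYNC paths above are what back signal-driven I/O.
 * arm_async_io() is a made-up helper name; the fcntl()/sigaction() calls
 * themselves are the standard Linux interfaces.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void on_io(int sig, siginfo_t *si, void *ctx)
{
	/* si->si_fd and si->si_band are filled in by send_sigio_to_task(). */
	(void)sig; (void)si; (void)ctx;
}

static int arm_async_io(int fd)
{
	struct sigaction sa = { .sa_sigaction = on_io, .sa_flags = SA_SIGINFO };
	int flags;

	if (sigaction(SIGRTMIN, &sa, NULL) < 0)
		return -1;
	if (fcntl(fd, F_SETOWN, getpid()) < 0)		/* handled by f_setown() */
		return -1;
	if (fcntl(fd, F_SETSIG, SIGRTMIN) < 0)		/* handled by f_owner_sig() */
		return -1;
	flags = fcntl(fd, F_GETFL);
	if (flags < 0 || fcntl(fd, F_SETFL, flags | O_ASYNC) < 0)
		return -1;				/* setfl() -> ->fasync() */
	return 0;
}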
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	Definitions for the 'struct ptr_ring' datastructure.
 *
 *	Author:
 *		Michael S. Tsirkin <mst@redhat.com>
 *
 *	Copyright (C) 2016 Red Hat, Inc.
 *
 *	This is a limited-size FIFO maintaining pointers in FIFO order, with
 *	one CPU producing entries and another consuming entries from a FIFO.
 *
 *	This implementation tries to minimize cache-contention when there is a
 *	single producer and a single consumer CPU.
*/ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return r->queue[r->producer]; } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. 
*/ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !r->queue[READ_ONCE(r->consumer_head)]; return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. * Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head; int head = consumer_head++; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. 
* We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) { /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = consumer_head; } if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. 
*/ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ head = r->consumer_head - 1; while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = r->consumer_head; /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. 
*/ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in BH context, you must * disable BH when doing so. */ static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple_bh(...) \ alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */
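/*
 * Illustrative sketch only: a single-producer/single-consumer queue built on
 * the ptr_ring API above.  The foo_* names and the fixed size of 128 slots
 * are made up; ptr_ring_init(), ptr_ring_produce(), ptr_ring_consume() and
 * ptr_ring_cleanup() are the interfaces defined in this header.  If the
 * producer or consumer runs in IRQ/BH context, the _irq/_bh/_any variants
 * apply instead, per the locking notes above.
 */
#include <linux/ptr_ring.h>
#include <linux/slab.h>

struct foo_item {
	int payload;
};

static struct ptr_ring foo_ring;

static int foo_ring_setup(void)
{
	/* 128 slots; ptr_ring_init() zeroes the array so the ring starts empty. */
	return ptr_ring_init(&foo_ring, 128, GFP_KERNEL);
}

static int foo_produce(int payload)
{
	struct foo_item *item = kmalloc(sizeof(*item), GFP_KERNEL);

	if (!item)
		return -ENOMEM;
	item->payload = payload;
	/* Fails with -ENOSPC when the ring is full; caller decides what to do. */
	if (ptr_ring_produce(&foo_ring, item)) {
		kfree(item);
		return -ENOSPC;
	}
	return 0;
}

static void foo_consume_all(void)
{
	struct foo_item *item;

	while ((item = ptr_ring_consume(&foo_ring)))
		kfree(item);
}

static void foo_item_destroy(void *ptr)
{
	kfree(ptr);
}

static void foo_ring_teardown(void)
{
	/* Frees any entries still queued, then the ring array itself. */
	ptr_ring_cleanup(&foo_ring, foo_item_destroy);
}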
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 * Fixes:
 *		Alan Cox	:	Commented a couple of minor bits of surplus code
 *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
 *					(just stops a compiler warning).
 *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
 *					are junked rather than corrupting things.
* Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. 
The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. 
*/ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. 
*/ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. 
Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, head); } /* 
Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); }
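The dispatch done in ip_protocol_deliver_rcu() above looks the packet's protocol number up in inet_protos[] and indirect-calls the registered handler. Below is a minimal sketch of how a transport protocol would plug into that table; the module, handler name, and the experimental protocol number 253 are illustrative assumptions, while inet_add_protocol()/inet_del_protocol() and struct net_protocol are the existing kernel API.

/* Minimal sketch (assumed demo module): register a struct net_protocol so that
 * ip_protocol_deliver_rcu() dispatches matching packets to our handler. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/protocol.h>

#define DEMO_IPPROTO 253	/* RFC 3692 experimental protocol number (assumption) */

static int demo_rcv(struct sk_buff *skb)
{
	/* Called under RCU from ip_protocol_deliver_rcu(); returning 0 counts the
	 * packet as delivered, a negative value requests resubmission. */
	pr_info("demo: received %u bytes\n", skb->len);
	consume_skb(skb);
	return 0;
}

static const struct net_protocol demo_protocol = {
	.handler   = demo_rcv,
	.no_policy = 1,		/* skip the xfrm policy check in the caller */
};

static int __init demo_init(void)
{
	return inet_add_protocol(&demo_protocol, DEMO_IPPROTO);
}

static void __exit demo_exit(void)
{
	inet_del_protocol(&demo_protocol, DEMO_IPPROTO);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("demo IPv4 protocol handler (illustrative)");
MODULE_LICENSE("GPL");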
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. 
*/ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. 
*/ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). 
*/ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. 
*/ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... 
b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */
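The address helpers above are typically used together in driver probe paths. Below is a minimal sketch, with a hypothetical helper demo_set_mac(), of the common pattern: keep a firmware-provided MAC only if is_valid_ether_addr() accepts it, otherwise fall back to a random locally administered address.

/* Minimal sketch (hypothetical driver helper) combining the validation and
 * assignment helpers from this header. */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void demo_set_mac(struct net_device *ndev, const u8 *fw_mac)
{
	if (fw_mac && is_valid_ether_addr(fw_mac)) {
		eth_hw_addr_set(ndev, fw_mac);	/* addr_assign_type left untouched */
	} else {
		eth_hw_addr_random(ndev);	/* flags the address as NET_ADDR_RANDOM */
		netdev_warn(ndev, "invalid MAC from firmware, using random %pM\n",
			    ndev->dev_addr);
	}
}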
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ipi #if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IPI_H #include <linux/tracepoint.h> /** * ipi_raise - called when a smp cross call is made * * @mask: mask of recipient CPUs for the IPI * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string. */ TRACE_EVENT(ipi_raise, TP_PROTO(const struct cpumask *mask, const char *reason), TP_ARGS(mask, reason), TP_STRUCT__entry( __bitmask(target_cpus, nr_cpumask_bits) __field(const char *, reason) ), TP_fast_assign( __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits); __entry->reason = reason; ), TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason) ); TRACE_EVENT(ipi_send_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback), TP_ARGS(cpu, callsite, callback), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpu=%u callsite=%pS callback=%pS", __entry->cpu, __entry->callsite, __entry->callback) ); TRACE_EVENT(ipi_send_cpumask, TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback), TP_ARGS(cpumask, callsite, callback), TP_STRUCT__entry( __cpumask(cpumask) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __assign_cpumask(cpumask, cpumask_bits(cpumask)); __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpumask=%s callsite=%pS callback=%pS", __get_cpumask(cpumask), __entry->callsite, __entry->callback) ); DECLARE_EVENT_CLASS(ipi_handler, TP_PROTO(const char *reason), TP_ARGS(reason), TP_STRUCT__entry( __field(const char *, reason) ), TP_fast_assign( __entry->reason = reason; ), TP_printk("(%s)", __entry->reason) ); /** * ipi_entry - called immediately before the IPI handler * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise * for that IPI. */ DEFINE_EVENT(ipi_handler, ipi_entry, TP_PROTO(const char *reason), TP_ARGS(reason) ); /** * ipi_exit - called immediately after the IPI handler returns * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise for * that IPI. */ DEFINE_EVENT(ipi_handler, ipi_exit, TP_PROTO(const char *reason), TP_ARGS(reason) ); #endif /* _TRACE_IPI_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
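These events are emitted from an architecture's SMP cross-call code. The sketch below (the function names and IRQ wiring are illustrative assumptions) shows the intended pattern: exactly one translation unit defines CREATE_TRACE_POINTS before including this header, and the @reason string is a static __tracepoint_string so the pointer recorded by TP_fast_assign can be resolved later.

/* Minimal sketch of raising and handling an IPI with these tracepoints. */
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>

#define CREATE_TRACE_POINTS	/* instantiate the events in this one unit */
#include <trace/events/ipi.h>

static const char *demo_resched_reason __tracepoint_string = "RESCHEDULE";

static void demo_send_reschedule(const struct cpumask *mask)
{
	trace_ipi_raise(mask, demo_resched_reason);
	/* ... architecture-specific code actually raises the IPI here ... */
}

static irqreturn_t demo_ipi_handler(int irq, void *dev_id)
{
	trace_ipi_entry(demo_resched_reason);
	/* ... handle the cross call ... */
	trace_ipi_exit(demo_resched_reason);
	return IRQ_HANDLED;
}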
// SPDX-License-Identifier: GPL-2.0-only #include <linux/fault-inject.h> #include <linux/fault-inject-usercopy.h> static struct { struct fault_attr attr; } fail_usercopy = { .attr = FAULT_ATTR_INITIALIZER, }; static int __init setup_fail_usercopy(char *str) { return setup_fault_attr(&fail_usercopy.attr, str); } __setup("fail_usercopy=", setup_fail_usercopy); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_usercopy_debugfs(void) { struct dentry *dir; dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); if (IS_ERR(dir)) return PTR_ERR(dir); return 0; } late_initcall(fail_usercopy_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ bool should_fail_usercopy(void) { return should_fail(&fail_usercopy.attr, 1); } EXPORT_SYMBOL_GPL(should_fail_usercopy);
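should_fail_usercopy() is the hook that the generic usercopy paths consult: when the configured fault_attr fires, the copy is treated as if it had faulted. The attribute is driven at runtime by the fail_usercopy= boot parameter parsed above (using the usual fault_attr syntax) or, with CONFIG_FAULT_INJECTION_DEBUG_FS, through the fail_usercopy debugfs directory. Below is a minimal sketch of a consumer; demo_copy_from_user() is a hypothetical wrapper, the real hook sites are the kernel's copy_from_user()/copy_to_user() fast paths.

/* Minimal sketch (hypothetical wrapper): report a full failure when the
 * usercopy fault injection fires, otherwise do the real copy. */
#include <linux/fault-inject-usercopy.h>
#include <linux/uaccess.h>

static unsigned long demo_copy_from_user(void *to, const void __user *from,
					  unsigned long n)
{
	if (should_fail_usercopy())
		return n;	/* pretend no bytes were copied */
	return copy_from_user(to, from, n);
}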
// SPDX-License-Identifier: GPL-2.0-only /* * VFIO core * * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> * * Derived from original vfio: * Copyright 2010 Cisco Systems, Inc. All rights reserved. * Author: Tom Lyon, pugs@cisco.com */ #include <linux/vfio.h> #include <linux/iommufd.h> #include <linux/anon_inodes.h> #include "vfio.h" static struct vfio { struct class *class; struct list_head group_list; struct mutex group_lock; /* locks group_list */ struct ida group_ida; dev_t group_devt; } vfio; static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { struct vfio_device *it, *device = ERR_PTR(-ENODEV); mutex_lock(&group->device_lock); list_for_each_entry(it, &group->device_list, group_next) { int ret; if (it->ops->match) { ret = it->ops->match(it, buf); if (ret < 0) { device = ERR_PTR(ret); break; } } else { ret = !strcmp(dev_name(it->dev), buf); } if (ret && vfio_device_try_get_registration(it)) { device = it; break; } } mutex_unlock(&group->device_lock); return device; } /* * VFIO Group fd, /dev/vfio/$GROUP */ static bool vfio_group_has_iommu(struct vfio_group *group) { lockdep_assert_held(&group->group_lock); /* * There can only be users if there is a container, and if there is a * container there must be users. */ WARN_ON(!group->container != !group->container_users); return group->container || group->iommufd; } /* * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or * if there was no container to unset. Since the ioctl is called on * the group, we know that still exists, therefore the only valid * transition here is 1->0. */ static int vfio_group_ioctl_unset_container(struct vfio_group *group) { int ret = 0; mutex_lock(&group->group_lock); if (!vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } if (group->container) { if (group->container_users != 1) { ret = -EBUSY; goto out_unlock; } vfio_group_detach_container(group); } if (group->iommufd) { iommufd_ctx_put(group->iommufd); group->iommufd = NULL; } out_unlock: mutex_unlock(&group->group_lock); return ret; } static int vfio_group_ioctl_set_container(struct vfio_group *group, int __user *arg) { struct vfio_container *container; struct iommufd_ctx *iommufd; int ret; int fd; if (get_user(fd, arg)) return -EFAULT; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; mutex_lock(&group->group_lock); if (vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } if (!group->iommu_group) { ret = -ENODEV; goto out_unlock; } container = vfio_container_from_file(fd_file(f)); if (container) { ret = vfio_container_attach_group(container, group); goto out_unlock; } iommufd = iommufd_ctx_from_file(fd_file(f)); if (!IS_ERR(iommufd)) { if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && group->type == VFIO_NO_IOMMU) ret = iommufd_vfio_compat_set_no_iommu(iommufd); else ret = iommufd_vfio_compat_ioas_create(iommufd); if (ret) { iommufd_ctx_put(iommufd); goto out_unlock; } group->iommufd = iommufd; goto out_unlock; } /* The FD passed is not recognized.
*/ ret = -EBADFD; out_unlock: mutex_unlock(&group->group_lock); return ret; } static void vfio_device_group_get_kvm_safe(struct vfio_device *device) { spin_lock(&device->group->kvm_ref_lock); vfio_device_get_kvm_safe(device, device->group->kvm); spin_unlock(&device->group->kvm_ref_lock); } static int vfio_df_group_open(struct vfio_device_file *df) { struct vfio_device *device = df->device; int ret; mutex_lock(&device->group->group_lock); if (!vfio_group_has_iommu(device->group)) { ret = -EINVAL; goto out_unlock; } mutex_lock(&device->dev_set->lock); /* * Before the first device open, get the KVM pointer currently * associated with the group (if there is one) and obtain a reference * now that will be held until the open_count reaches 0 again. Save * the pointer in the device for use by drivers. */ if (device->open_count == 0) vfio_device_group_get_kvm_safe(device); df->iommufd = device->group->iommufd; if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) { /* * Require no compat ioas to be assigned to proceed. The basic * statement is that the user cannot have done something that * implies they expected translation to exist */ if (!capable(CAP_SYS_RAWIO) || vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) ret = -EPERM; else ret = 0; goto out_put_kvm; } ret = vfio_df_open(df); if (ret) goto out_put_kvm; if (df->iommufd && device->open_count == 1) { ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd); if (ret) goto out_close_device; } /* * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ * read/write/mmap and vfio_file_has_device_access() */ smp_store_release(&df->access_granted, true); mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); return 0; out_close_device: vfio_df_close(df); out_put_kvm: df->iommufd = NULL; if (device->open_count == 0) vfio_device_put_kvm(device); mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); return ret; } void vfio_df_group_close(struct vfio_device_file *df) { struct vfio_device *device = df->device; mutex_lock(&device->group->group_lock); mutex_lock(&device->dev_set->lock); vfio_df_close(df); df->iommufd = NULL; if (device->open_count == 0) vfio_device_put_kvm(device); mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); } static struct file *vfio_device_open_file(struct vfio_device *device) { struct vfio_device_file *df; struct file *filep; int ret; df = vfio_allocate_device_file(device); if (IS_ERR(df)) { ret = PTR_ERR(df); goto err_out; } df->group = device->group; ret = vfio_df_group_open(df); if (ret) goto err_free; /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, df, O_RDWR); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; } /* * TODO: add an anon_inode interface to do this. * Appears to be missing by lack of need rather than * explicitly prevented. Now there's need. */ filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); /* * Use the pseudo fs inode on the device to link all mmaps * to the same address space, allowing us to unmap all vmas * associated to this device using unmap_mapping_range(). 
*/ filep->f_mapping = device->inode->i_mapping; if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); /* * On success the ref of device is moved to the file and * put in vfio_device_fops_release() */ return filep; err_close_device: vfio_df_group_close(df); err_free: kfree(df); err_out: return ERR_PTR(ret); } static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, char __user *arg) { struct vfio_device *device; struct file *filep; char *buf; int fdno; int ret; buf = strndup_user(arg, PAGE_SIZE); if (IS_ERR(buf)) return PTR_ERR(buf); device = vfio_device_get_from_name(group, buf); kfree(buf); if (IS_ERR(device)) return PTR_ERR(device); fdno = get_unused_fd_flags(O_CLOEXEC); if (fdno < 0) { ret = fdno; goto err_put_device; } filep = vfio_device_open_file(device); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_put_fdno; } fd_install(fdno, filep); return fdno; err_put_fdno: put_unused_fd(fdno); err_put_device: vfio_device_put_registration(device); return ret; } static int vfio_group_ioctl_get_status(struct vfio_group *group, struct vfio_group_status __user *arg) { unsigned long minsz = offsetofend(struct vfio_group_status, flags); struct vfio_group_status status; if (copy_from_user(&status, arg, minsz)) return -EFAULT; if (status.argsz < minsz) return -EINVAL; status.flags = 0; mutex_lock(&group->group_lock); if (!group->iommu_group) { mutex_unlock(&group->group_lock); return -ENODEV; } /* * With the container FD the iommu_group_claim_dma_owner() is done * during SET_CONTAINER but for IOMMFD this is done during * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due * to viability. 
*/ if (vfio_group_has_iommu(group)) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; mutex_unlock(&group->group_lock); if (copy_to_user(arg, &status, minsz)) return -EFAULT; return 0; } static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; void __user *uarg = (void __user *)arg; switch (cmd) { case VFIO_GROUP_GET_DEVICE_FD: return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: return vfio_group_ioctl_get_status(group, uarg); case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: return vfio_group_ioctl_unset_container(group); default: return -ENOTTY; } } int vfio_device_block_group(struct vfio_device *device) { struct vfio_group *group = device->group; int ret = 0; mutex_lock(&group->group_lock); if (group->opened_file) { ret = -EBUSY; goto out_unlock; } group->cdev_device_open_cnt++; out_unlock: mutex_unlock(&group->group_lock); return ret; } void vfio_device_unblock_group(struct vfio_device *device) { struct vfio_group *group = device->group; mutex_lock(&group->group_lock); group->cdev_device_open_cnt--; mutex_unlock(&group->group_lock); } static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group = container_of(inode->i_cdev, struct vfio_group, cdev); int ret; mutex_lock(&group->group_lock); /* * drivers can be zero if this races with vfio_device_remove_group(), it * will be stable at 0 under the group rwsem */ if (refcount_read(&group->drivers) == 0) { ret = -ENODEV; goto out_unlock; } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { ret = -EPERM; goto out_unlock; } if (group->cdev_device_open_cnt) { ret = -EBUSY; goto out_unlock; } /* * Do we need multiple instances of the group open? Seems not. */ if (group->opened_file) { ret = -EBUSY; goto out_unlock; } group->opened_file = filep; filep->private_data = group; ret = 0; out_unlock: mutex_unlock(&group->group_lock); return ret; } static int vfio_group_fops_release(struct inode *inode, struct file *filep) { struct vfio_group *group = filep->private_data; filep->private_data = NULL; mutex_lock(&group->group_lock); /* * Device FDs hold a group file reference, therefore the group release * is only called when there are no open devices. */ WARN_ON(group->notifier.head); if (group->container) vfio_group_detach_container(group); if (group->iommufd) { iommufd_ctx_put(group->iommufd); group->iommufd = NULL; } group->opened_file = NULL; mutex_unlock(&group->group_lock); return 0; } static const struct file_operations vfio_group_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vfio_group_fops_unl_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vfio_group_fops_open, .release = vfio_group_fops_release, }; /* * Group objects - create, release, get, put, search */ static struct vfio_group * vfio_group_find_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; lockdep_assert_held(&vfio.group_lock); /* * group->iommu_group from the vfio.group_list cannot be NULL * under the vfio.group_lock. 
*/ list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) return group; } return NULL; } static void vfio_group_release(struct device *dev) { struct vfio_group *group = container_of(dev, struct vfio_group, dev); mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); WARN_ON(group->iommu_group); WARN_ON(group->cdev_device_open_cnt); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_group *group; int minor; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(group); return ERR_PTR(minor); } device_initialize(&group->dev); group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); group->dev.class = vfio.class; group->dev.release = vfio_group_release; cdev_init(&group->cdev, &vfio_group_fops); group->cdev.owner = THIS_MODULE; refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); spin_lock_init(&group->kvm_ref_lock); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); group->type = type; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); return group; } static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_group *group; struct vfio_group *ret; int err; lockdep_assert_held(&vfio.group_lock); group = vfio_group_alloc(iommu_group, type); if (IS_ERR(group)) return group; err = dev_set_name(&group->dev, "%s%d", group->type == VFIO_NO_IOMMU ? "noiommu-" : "", iommu_group_id(iommu_group)); if (err) { ret = ERR_PTR(err); goto err_put; } err = cdev_device_add(&group->cdev, &group->dev); if (err) { ret = ERR_PTR(err); goto err_put; } list_add(&group->vfio_next, &vfio.group_list); return group; err_put: put_device(&group->dev); return ret; } static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { struct iommu_group *iommu_group; struct vfio_group *group; int ret; iommu_group = iommu_group_alloc(); if (IS_ERR(iommu_group)) return ERR_CAST(iommu_group); ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); if (ret) goto out_put_group; ret = iommu_group_add_device(iommu_group, dev); if (ret) goto out_put_group; mutex_lock(&vfio.group_lock); group = vfio_create_group(iommu_group, type); mutex_unlock(&vfio.group_lock); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_remove_device; } iommu_group_put(iommu_group); return group; out_remove_device: iommu_group_remove_device(dev); out_put_group: iommu_group_put(iommu_group); return ERR_PTR(ret); } static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) { struct vfio_device *device; mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { if (device->dev == dev) { mutex_unlock(&group->device_lock); return true; } } mutex_unlock(&group->device_lock); return false; } static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) { struct iommu_group *iommu_group; struct vfio_group *group; iommu_group = iommu_group_get(dev); if (!iommu_group && vfio_noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that * don't already have one, implying no IOMMU hardware/driver * exists. 
Taint the kernel because we're about to give a DMA * capable device to a user without IOMMU protection. */ group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); if (!IS_ERR(group)) { add_taint(TAINT_USER, LOCKDEP_STILL_OK); dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); } return group; } if (!iommu_group) return ERR_PTR(-EINVAL); mutex_lock(&vfio.group_lock); group = vfio_group_find_from_iommu(iommu_group); if (group) { if (WARN_ON(vfio_group_has_device(group, dev))) group = ERR_PTR(-EINVAL); else refcount_inc(&group->drivers); } else { group = vfio_create_group(iommu_group, VFIO_IOMMU); } mutex_unlock(&vfio.group_lock); /* The vfio_group holds a reference to the iommu_group */ iommu_group_put(iommu_group); return group; } int vfio_device_set_group(struct vfio_device *device, enum vfio_group_type type) { struct vfio_group *group; if (type == VFIO_IOMMU) group = vfio_group_find_or_alloc(device->dev); else group = vfio_noiommu_group_alloc(device->dev, type); if (IS_ERR(group)) return PTR_ERR(group); /* Our reference on group is moved to the device */ device->group = group; return 0; } void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; struct iommu_group *iommu_group; if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) return; list_del(&group->vfio_next); /* * We could concurrently probe another driver in the group that might * race vfio_device_remove_group() with vfio_get_group(), so we have to * ensure that the sysfs is all cleaned up under lock otherwise the * cdev_device_add() will fail due to the name aready existing. */ cdev_device_del(&group->cdev, &group->dev); mutex_lock(&group->group_lock); /* * These data structures all have paired operations that can only be * undone when the caller holds a live reference on the device. Since * all pairs must be undone these WARN_ON's indicate some caller did not * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); WARN_ON(group->notifier.head); /* * Revoke all users of group->iommu_group. At this point we know there * are no devices active because we are unplugging the last one. Setting * iommu_group to NULL blocks all new users. 
*/ if (group->container) vfio_group_detach_container(group); iommu_group = group->iommu_group; group->iommu_group = NULL; mutex_unlock(&group->group_lock); mutex_unlock(&vfio.group_lock); iommu_group_put(iommu_group); put_device(&group->dev); } void vfio_device_group_register(struct vfio_device *device) { mutex_lock(&device->group->device_lock); list_add(&device->group_next, &device->group->device_list); mutex_unlock(&device->group->device_lock); } void vfio_device_group_unregister(struct vfio_device *device) { mutex_lock(&device->group->device_lock); list_del(&device->group_next); mutex_unlock(&device->group->device_lock); } int vfio_device_group_use_iommu(struct vfio_device *device) { struct vfio_group *group = device->group; int ret = 0; lockdep_assert_held(&group->group_lock); if (WARN_ON(!group->container)) return -EINVAL; ret = vfio_group_use_container(group); if (ret) return ret; vfio_device_container_register(device); return 0; } void vfio_device_group_unuse_iommu(struct vfio_device *device) { struct vfio_group *group = device->group; lockdep_assert_held(&group->group_lock); if (WARN_ON(!group->container)) return; vfio_device_container_unregister(device); vfio_group_unuse_container(group); } bool vfio_device_has_container(struct vfio_device *device) { return device->group->container; } struct vfio_group *vfio_group_from_file(struct file *file) { struct vfio_group *group = file->private_data; if (file->f_op != &vfio_group_fops) return NULL; return group; } /** * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file * * The returned iommu_group is valid as long as a ref is held on the file. This * returns a reference on the group. This function is deprecated, only the SPAPR * path in kvm should call it. */ struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = vfio_group_from_file(file); struct iommu_group *iommu_group = NULL; if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) return NULL; if (!group) return NULL; mutex_lock(&group->group_lock); if (group->iommu_group) { iommu_group = group->iommu_group; iommu_group_ref_get(iommu_group); } mutex_unlock(&group->group_lock); return iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); /** * vfio_file_is_group - True if the file is a vfio group file * @file: VFIO group file */ bool vfio_file_is_group(struct file *file) { return vfio_group_from_file(file); } EXPORT_SYMBOL_GPL(vfio_file_is_group); bool vfio_group_enforced_coherent(struct vfio_group *group) { struct vfio_device *device; bool ret = true; /* * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then * any domain later attached to it will also not support it. If the cap * is set then the iommu_domain eventually attached to the device/group * must use a domain with enforce_cache_coherency(). */ mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { if (!device_iommu_capable(device->dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { ret = false; break; } } mutex_unlock(&group->device_lock); return ret; } void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) { spin_lock(&group->kvm_ref_lock); group->kvm = kvm; spin_unlock(&group->kvm_ref_lock); } /** * vfio_file_has_dev - True if the VFIO file is a handle for device * @file: VFIO file to check * @device: Device that must be part of the file * * Returns true if given file has permission to manipulate the given device. 
*/ bool vfio_file_has_dev(struct file *file, struct vfio_device *device) { struct vfio_group *group = vfio_group_from_file(file); if (!group) return false; return group == device->group; } EXPORT_SYMBOL_GPL(vfio_file_has_dev); static char *vfio_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } int __init vfio_group_init(void) { int ret; ida_init(&vfio.group_ida); mutex_init(&vfio.group_lock); INIT_LIST_HEAD(&vfio.group_list); ret = vfio_container_init(); if (ret) return ret; /* /dev/vfio/$GROUP */ vfio.class = class_create("vfio"); if (IS_ERR(vfio.class)) { ret = PTR_ERR(vfio.class); goto err_group_class; } vfio.class->devnode = vfio_devnode; ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); if (ret) goto err_alloc_chrdev; return 0; err_alloc_chrdev: class_destroy(vfio.class); vfio.class = NULL; err_group_class: vfio_container_cleanup(); return ret; } void vfio_group_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.class); vfio.class = NULL; vfio_container_cleanup(); }
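The group ioctls above make up the legacy /dev/vfio/$GROUP userspace flow: check viability with VFIO_GROUP_GET_STATUS, attach the group to a container, select an IOMMU model, then fetch a device fd. A minimal userspace sketch of that sequence follows; the group number "26", the PCI address and the omitted error handling are illustrative assumptions, not part of the source above.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	int container, group, device;

	/* "26" and the PCI address below are placeholders for a real system. */
	container = open("/dev/vfio/vfio", O_RDWR);
	group = open("/dev/vfio/26", O_RDWR);

	/* Mirrors vfio_group_ioctl_get_status(): VIABLE means all devices in the group are bound. */
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		fprintf(stderr, "group is not viable\n");
		return 1;
	}

	/* Attach the group to the container, then select the IOMMU backend. */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Device fds are handed out via the group, as in vfio_group_ioctl_get_device_fd(). */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

	return device < 0;
}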
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies.
All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 }, }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; 
return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u32 max_io_eqs; int err; if (!port->ops->port_fn_max_io_eqs_get) return 0; err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if 
(!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if 
(err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { u32 max_io_eqs; max_io_eqs = nla_get_u32(attr); return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port, max_io_eqs, extack); } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!__devl_is_registered(devlink) || 
!devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); devlink_nl_obj_desc_port_set(&desc, devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if 
(tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] && !ops->port_fn_max_io_eqs_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS], "Function does not support max_io_eqs setting"); return -EOPNOTSUPP; } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]; if (attr) { err = devlink_port_fn_max_io_eqs_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. 
*/ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. 
*/ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. 
*/ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and release devlink->lock <mutex>. 
*/ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. 
*/ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. 
*/ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @vf: associated PCI VF number of a PF for the devlink port instance; * VF number starts from 0 for the first PCI virtual function * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @sf: associated SF number of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = 
external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devlink_port; /* RTNL mutex is held here which ensures that devlink_port * instance cannot disappear in the middle. 
No need to take * any devlink lock as only permanent values are accessed. */ ASSERT_RTNL(); devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; return __devlink_port_phys_port_name_get(devlink_port, name, len); } int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid) { struct devlink_port *devlink_port; /* Caller must hold RTNL mutex or reference to dev, which ensures that * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); return 0; }
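For the driver-facing side of the port API above, a minimal sketch of registering a physical port is shown below; my_port_init() is a hypothetical driver helper and the caller-provided devlink instance is assumed to exist. It relies only on devlink_port_attrs_set() and devlink_port_register_with_ops() as documented in their kernel-doc comments: NULL ops falls back to devlink_port_dummy_ops, and the wrapper takes devlink->lock itself.

#include <linux/string.h>
#include <net/devlink.h>

/* Hedged sketch only: my_port_init() is not part of the devlink code above. */
static int my_port_init(struct devlink *devlink, struct devlink_port *port,
			unsigned int index)
{
	struct devlink_port_attrs attrs = {};

	/* The kernel-doc above asks callers to zero the devlink_port first. */
	memset(port, 0, sizeof(*port));

	attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
	attrs.phys.port_number = index;
	devlink_port_attrs_set(port, &attrs);

	/* NULL ops selects devlink_port_dummy_ops; this variant locks devlink itself. */
	return devlink_port_register_with_ops(devlink, port, index, NULL);
}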
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005-2010 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm.h */ #ifndef __INTEGRITY_EVM_H #define __INTEGRITY_EVM_H #include <linux/xattr.h> #include <linux/security.h> #include "../integrity.h" #define EVM_INIT_HMAC 0x0001 #define EVM_INIT_X509 0x0002 #define EVM_ALLOW_METADATA_WRITES 0x0004 #define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */ #define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509) #define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \ EVM_ALLOW_METADATA_WRITES) struct xattr_list { struct list_head list; char *name; bool enabled; }; #define EVM_NEW_FILE 0x00000001 #define EVM_IMMUTABLE_DIGSIG 0x00000002 /* EVM integrity metadata associated with an inode */ struct evm_iint_cache { unsigned long flags; enum integrity_status evm_status:4; struct integrity_inode_attributes metadata_inode; }; extern struct lsm_blob_sizes evm_blob_sizes; static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + evm_blob_sizes.lbs_inode; } extern int evm_initialized; #define EVM_ATTR_FSUUID 0x0001 extern int evm_hmac_attrs; /* List of EVM protected security xattrs */ extern struct list_head evm_config_xattrnames; struct evm_digest { struct ima_digest_data_hdr hdr; char digest[IMA_MAX_DIGEST_SIZE]; } __packed; int evm_protected_xattr(const char *req_xattr_name); int evm_init_key(void); int evm_update_evmxattr(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len); int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, struct evm_digest *data, struct evm_iint_cache *iint); int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, char type, struct evm_digest *data, struct evm_iint_cache *iint); int evm_init_hmac(struct inode *inode, const struct xattr *xattrs, char *hmac_val); int evm_init_secfs(void); #endif
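The header above only defines state bits; the tiny user-space sketch below (hypothetical, with the defines copied from evm.h) shows how they are meant to combine: EVM_KEY_MASK answers "is any key loaded?", while EVM_SETUP_COMPLETE records that userland has finished loading keys.

/* Illustrative only: how the EVM state bits combine. */
#include <stdio.h>

#define EVM_INIT_HMAC                   0x0001
#define EVM_INIT_X509                   0x0002
#define EVM_ALLOW_METADATA_WRITES       0x0004
#define EVM_SETUP_COMPLETE              0x80000000
#define EVM_KEY_MASK                    (EVM_INIT_HMAC | EVM_INIT_X509)

int main(void)
{
        unsigned int evm_initialized = EVM_INIT_X509 | EVM_SETUP_COMPLETE;

        if (evm_initialized & EVM_KEY_MASK)
                printf("at least one EVM key is loaded\n");
        if (evm_initialized & EVM_SETUP_COMPLETE)
                printf("userland has signalled that key loading is complete\n");
        return 0;
}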
// SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem.
*/ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; 
} return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { 
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. 
*/ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. 
* * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. 
*/ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. 
*/ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. 
*/ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
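For context, here is a minimal sketch of how a hypothetical consumer would drive the parsing and validation code above: the nla_policy table feeds validate_nla()/__nla_validate_parse(), so by the time tb[] is filled every attribute's length and range have already been checked. MY_ATTR_*, my_policy and my_parse() are made-up names for illustration, not part of this file.

/* Illustrative only: a hypothetical nested-attribute parser. */
#include <linux/kernel.h>
#include <net/netlink.h>

enum {
        MY_ATTR_UNSPEC,
        MY_ATTR_NAME,
        MY_ATTR_MTU,
        __MY_ATTR_MAX
};
#define MY_ATTR_MAX (__MY_ATTR_MAX - 1)

static const struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
        [MY_ATTR_NAME]  = { .type = NLA_NUL_STRING, .len = 31 },
        [MY_ATTR_MTU]   = NLA_POLICY_RANGE(NLA_U32, 68, 9000),
};

static int my_parse(const struct nlattr *opts, struct netlink_ext_ack *extack)
{
        struct nlattr *tb[MY_ATTR_MAX + 1];
        char name[32];
        int err;

        /* Fills tb[] and strictly validates: bad lengths or out-of-range
         * values are rejected in validate_nla() before we ever look at them. */
        err = nla_parse_nested(tb, MY_ATTR_MAX, opts, my_policy, extack);
        if (err)
                return err;

        if (tb[MY_ATTR_NAME])
                nla_strscpy(name, tb[MY_ATTR_NAME], sizeof(name));
        if (tb[MY_ATTR_MTU])
                pr_info("requested mtu %u\n", nla_get_u32(tb[MY_ATTR_MTU]));
        return 0;
}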
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the interrupt descriptor management code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "internals.h" /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; } __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) { } #endif #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif return 0; } static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; cpumask_copy(desc->irq_common_data.affinity, affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif } static void free_masks(struct irq_desc *desc) { #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif } #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void
desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { int cpu; desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } static unsigned int nr_irqs = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { return nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); /** * irq_set_nr_irqs() - Set the number of interrupts supported by the system. * @nr: New number of interrupts. * * Return: @nr. */ unsigned int irq_set_nr_irqs(unsigned int nr) { nr_irqs = nr; return nr; } EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU, sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { MA_STATE(mas, &sparse_irqs, 0, 0); if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) return -ENOSPC; return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; struct irq_desc *desc; guard(rcu)(); desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? 
irq_desc_get_irq(desc) : nr_irqs; } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { MA_STATE(mas, &sparse_irqs, irq, irq); WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); } static void delete_irq_desc(unsigned int irq) { MA_STATE(mas, &sparse_irqs, irq, irq); mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ static const struct kobj_type irq_kobj_type; #endif static int init_desc(struct irq_desc *desc, int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; if (alloc_masks(desc, node)) { free_percpu(desc->kstat_irqs); return -ENOMEM; } raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); irq_resend_init(desc); #ifdef CONFIG_SPARSE_IRQ kobject_init(&desc->kobj, &irq_kobj_type); init_rcu_head(&desc->rcu); #endif return 0; } #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); #ifdef CONFIG_SYSFS static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; char *p = ""; int cpu; for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ","; } ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); if (desc->irq_data.chip && desc->irq_data.chip->name) { ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->irq_data.chip->name); } raw_spin_unlock_irq(&desc->lock); return ret; } IRQ_ATTR_RO(chip_name); static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); if (desc->irq_data.domain) ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); raw_spin_unlock_irq(&desc->lock); return ret; } IRQ_ATTR_RO(hwirq); static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); ret = sprintf(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? 
"level" : "edge"); raw_spin_unlock_irq(&desc->lock); return ret; } IRQ_ATTR_RO(type); static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); raw_spin_unlock_irq(&desc->lock); return ret; } IRQ_ATTR_RO(wakeup); static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); if (desc->name) ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); raw_spin_unlock_irq(&desc->lock); return ret; } IRQ_ATTR_RO(name); static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; raw_spin_lock_irq(&desc->lock); for_each_action_of_desc(desc, action) { ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", p, action->name); p = ","; } raw_spin_unlock_irq(&desc->lock); if (ret) ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); return ret; } IRQ_ATTR_RO(actions); static struct attribute *irq_attrs[] = { &per_cpu_count_attr.attr, &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL }; ATTRIBUTE_GROUPS(irq); static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) { if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing * crucial and failures in the late irq_sysfs_init() * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); else desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* * Only invoke kobject_del() when kobject_add() was successfully * invoked for the descriptor. This covers both early boot, where * sysfs is not initialized yet, and the case of a failed * kobject_add() invocation. 
*/ if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } static int __init irq_sysfs_init(void) { struct irq_desc *desc; int irq; /* Prevent concurrent irq alloc/free */ irq_lock_sparse(); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); if (!irq_kobj_base) { irq_unlock_sparse(); return -ENOMEM; } /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); irq_unlock_sparse(); return 0; } postcore_initcall(irq_sysfs_init); #else /* !CONFIG_SYSFS */ static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); } void irq_unlock_sparse(void) { mutex_unlock(&sparse_irq_lock); } static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; ret = init_desc(desc, irq, node, flags, affinity, owner); if (unlikely(ret)) { kfree(desc); return NULL; } return desc; } static void irq_kobj_release(struct kobject *kobj) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } static void delayed_free_desc(struct rcu_head *rhp) { struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); kobject_put(&desc->kobj); } static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* * sparse_irq_lock protects also show_interrupts() and * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. * * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ irq_sysfs_del(desc); delete_irq_desc(irq); /* * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. * This also allows us to use rcu in kstat_irqs_usr(). 
*/ call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { struct irq_desc *desc; int i; /* Validate affinity mask(s) */ if (affinity) { for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; unsigned int flags = 0; if (affinity) { if (affinity->is_managed) { flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } return start; err: for (i--; i >= 0; i--) free_desc(start + i); return -ENOMEM; } static int irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return -ENOMEM; nr_irqs = nr; return 0; } int __init early_irq_init(void) { int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } return arch_early_irq_init(); } #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; int __init early_irq_init(void) { int count, i, node = first_online_node; int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); if (unlikely(ret)) goto __free_desc_res; } return arch_early_irq_init(); __free_desc_res: while (--i >= 0) { free_masks(irq_desc + i); free_percpu(irq_desc[i].kstat_irqs); } return ret; } struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; } EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; for (i = 0; i < cnt; i++) { struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; irq_insert_desc(start + i, desc); } return start; } static int irq_expand_nr_irqs(unsigned int nr) { return -ENOMEM; } void irq_mark_irq(unsigned int irq) { mutex_lock(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); mutex_unlock(&sparse_irq_lock); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq) { free_desc(irq); } #endif #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) { struct irq_data *data; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); return 0; } /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); } EXPORT_SYMBOL_GPL(generic_handle_irq); /** * generic_handle_irq_safe - Invoke the handler for a particular irq from any * context. * @irq: The irq number to handle * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process context). It * will report an error if not invoked from IRQ context and the irq has been * marked to enforce IRQ-context only. */ int generic_handle_irq_safe(unsigned int irq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_to_desc(irq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_irq_safe); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); /** * generic_handle_irq_safe - Invoke the handler for a HW irq belonging * to a domain from any context. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. 
*/ int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an NMI context with irq regs * initialized. **/ int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #endif /* Dynamic interrupt handling */ /** * irq_free_descs - free irq descriptors * @from: Start of descriptor range * @cnt: Number of consecutive irqs to free */ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) return; mutex_lock(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); mutex_unlock(&sparse_irq_lock); } EXPORT_SYMBOL_GPL(irq_free_descs); /** * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) * @affinity: Optional pointer to an affinity mask array of size @cnt which * hints where the irq descriptors should be allocated and which * default affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity) { int start, ret; if (!cnt) return -EINVAL; if (irq >= 0) { if (from > irq) return -EINVAL; from = irq; } else { /* * For interrupts which are freely allocated the * architecture can force a lower bound to the @from * argument. x86 uses this to exclude the GSI space. */ from = arch_dynirq_lower_bound(from); } mutex_lock(&sparse_irq_lock); start = irq_find_free_area(from, cnt); ret = -EEXIST; if (irq >=0 && start != irq) goto unlock; if (start + cnt > nr_irqs) { ret = irq_expand_nr_irqs(start + cnt); if (ret) goto unlock; } ret = alloc_descs(start, cnt, node, affinity, owner); unlock: mutex_unlock(&sparse_irq_lock); return ret; } EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * * Returns next irq number after offset or nr_irqs if none is found. 
*/ unsigned int irq_get_next_irq(unsigned int offset) { return irq_find_at_or_after(offset); } struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { if (check & _IRQ_DESC_CHECK) { if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) return NULL; if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) return NULL; } if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); } return desc; } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) chip_bus_sync_unlock(desc); } int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); if (!desc->percpu_enabled) return -ENOMEM; desc->percpu_affinity = affinity ? : cpu_possible_mask; irq_set_percpu_devid_flags(irq); return 0; } int irq_set_percpu_devid(unsigned int irq) { return irq_set_percpu_devid_partition(irq, NULL); } int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->percpu_enabled) return -EINVAL; if (affinity) cpumask_copy(affinity, desc->percpu_affinity); return 0; } EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu * @irq: The interrupt number * @cpu: The cpu number * * Returns the sum of interrupt counts on @cpu since boot for * @irq. The caller must ensure that the interrupt is not removed * concurrently. */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return kstat_irqs_desc(desc, cpu_possible_mask); } #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) { struct irq_desc *desc; unsigned int irq; for_each_irq_desc(irq, desc) { if (!desc->kstat_irqs) continue; this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); } } unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); } #endif /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. * * It uses rcu to protect the access since a concurrent removal of an * interrupt descriptor is observing an rcu grace period before * delayed_free_desc()/irq_kobj_release(). 
*/ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; rcu_read_lock(); sum = kstat_irqs(irq); rcu_read_unlock(); return sum; } #ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { lockdep_set_class(&desc->lock, lock_class); lockdep_set_class(&desc->request_mutex, request_class); } } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif
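The descriptor and domain handling code above is easiest to read next to a caller. Below is a minimal, hypothetical sketch (not taken from the kernel tree) of a cascaded interrupt controller's parent handler that demultiplexes child interrupts with generic_handle_domain_irq(); struct demux_chip, its status register, and demux_handler() are invented for illustration. A caller that is not in hard IRQ context would use generic_handle_domain_irq_safe() instead, which wraps the call in local_irq_save()/local_irq_restore().

#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>

/* Hypothetical per-controller state for this sketch. */
struct demux_chip {
	struct irq_domain *domain;
	void __iomem *regs;
};

static irqreturn_t demux_handler(int irq, void *data)
{
	struct demux_chip *chip = data;
	/* Hypothetical 32-bit "pending children" status register at offset 0. */
	unsigned long pending = readl(chip->regs);
	unsigned int hwirq;

	/* Hand each pending child hwirq to its Linux interrupt descriptor. */
	for_each_set_bit(hwirq, &pending, 32)
		generic_handle_domain_irq(chip->domain, hwirq);

	return IRQ_HANDLED;
}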
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Linaro * Author: Christoffer Dall <christoffer.dall@linaro.org> */ #include <linux/cpu.h> #include <linux/debugfs.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/seq_file.h> #include <kvm/arm_vgic.h> #include <asm/kvm_mmu.h> #include "vgic.h" /* * Structure to control looping through the entire vgic state. We start at * zero for each field and move upwards. So, if dist_id is 0 we print the * distributor info. When dist_id is 1, we have already printed it and move * on. * * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and * so on. */ struct vgic_state_iter { int nr_cpus; int nr_spis; int nr_lpis; int dist_id; int vcpu_id; unsigned long intid; int lpi_idx; }; static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) { struct vgic_dist *dist = &kvm->arch.vgic; if (iter->dist_id == 0) { iter->dist_id++; return; } /* * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. 
*/ if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && iter->nr_lpis) { if (iter->lpi_idx < iter->nr_lpis) xa_find_after(&dist->lpi_xa, &iter->intid, VGIC_LPI_MAX_INTID, LPI_XA_MARK_DEBUG_ITER); iter->lpi_idx++; return; } iter->intid++; if (iter->intid == VGIC_NR_PRIVATE_IRQS && ++iter->vcpu_id < iter->nr_cpus) iter->intid = 0; } static int iter_mark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long intid; int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { if (!vgic_try_get_irq_kref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); nr_lpis++; } return nr_lpis; } static void iter_unmark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long intid; xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); vgic_put_irq(kvm, irq); } } static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, loff_t pos) { int nr_cpus = atomic_read(&kvm->online_vcpus); memset(iter, 0, sizeof(*iter)); iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) iter_next(kvm, iter); } static bool end_of_vgic(struct vgic_state_iter *iter) { return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; struct vgic_state_iter *iter; mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; if (iter) { iter = ERR_PTR(-EBUSY); goto out; } iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { iter = ERR_PTR(-ENOMEM); goto out; } iter_init(kvm, iter, *pos); kvm->arch.vgic.iter = iter; if (end_of_vgic(iter)) iter = NULL; out: mutex_unlock(&kvm->arch.config_lock); return iter; } static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; iter_next(kvm, iter); if (end_of_vgic(iter)) iter = NULL; return iter; } static void vgic_debug_stop(struct seq_file *s, void *v) { struct kvm *kvm = s->private; struct vgic_state_iter *iter; /* * If the seq file wasn't properly opened, there's nothing to clean * up. */ if (IS_ERR(v)) return; mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->arch.config_lock); } static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; seq_printf(s, "Distributor\n"); seq_printf(s, "===========\n"); seq_printf(s, "vgic_model:\t%s\n", v3 ? 
"GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); seq_printf(s, "G=group\n"); } static void print_header(struct seq_file *s, struct vgic_irq *irq, struct kvm_vcpu *vcpu) { int id = 0; char *hdr = "SPI "; if (vcpu) { hdr = "VCPU"; id = vcpu->vcpu_idx; } seq_printf(s, "\n"); seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); seq_printf(s, "----------------------------------------------------------------\n"); } static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, struct kvm_vcpu *vcpu) { char *type; bool pending; if (irq->intid < VGIC_NR_SGIS) type = "SGI"; else if (irq->intid < VGIC_NR_PRIVATE_IRQS) type = "PPI"; else if (irq->intid < VGIC_MAX_SPI) type = "SPI"; else type = "LPI"; if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) print_header(s, irq, vcpu); pending = irq->pending_latch; if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; err = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, &pending); WARN_ON_ONCE(err); } seq_printf(s, " %s %4d " " %2d " "%d%d%d%d%d%d%d " "%8d " "%8x " " %2x " "%3d " " %2d " "\n", type, irq->intid, (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1, pending, irq->line_level, irq->active, irq->enabled, irq->hw, irq->config == VGIC_CONFIG_LEVEL, irq->group, irq->hwintid, irq->mpidr, irq->source, irq->priority, (irq->vcpu) ? irq->vcpu->vcpu_idx : -1); } static int vgic_debug_show(struct seq_file *s, void *v) { struct kvm *kvm = s->private; struct vgic_state_iter *iter = v; struct vgic_irq *irq; struct kvm_vcpu *vcpu = NULL; unsigned long flags; if (iter->dist_id == 0) { print_dist_state(s, &kvm->arch.vgic, iter); return 0; } if (!kvm->arch.vgic.initialized) return 0; if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); /* * Expect this to succeed, as iter_mark_lpis() takes a reference on * every LPI to be visited. */ if (iter->intid < VGIC_NR_PRIVATE_IRQS) irq = vgic_get_vcpu_irq(vcpu, iter->intid); else irq = vgic_get_irq(kvm, iter->intid); if (WARN_ON_ONCE(!irq)) return -EINVAL; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); return 0; } static const struct seq_operations vgic_debug_sops = { .start = vgic_debug_start, .next = vgic_debug_next, .stop = vgic_debug_stop, .show = vgic_debug_show }; DEFINE_SEQ_ATTRIBUTE(vgic_debug); void vgic_debug_init(struct kvm *kvm) { debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, &vgic_debug_fops); } void vgic_debug_destroy(struct kvm *kvm) { }
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> */ #ifndef __ASM_CPUFEATURE_H #define __ASM_CPUFEATURE_H #include <asm/alternative-macros.h> #include <asm/cpucaps.h> #include <asm/cputype.h> #include <asm/hwcap.h> #include <asm/sysreg.h> #define MAX_CPU_FEATURES 192 #define cpu_feature(x) KERNEL_HWCAP_ ## x #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 #define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8 #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/cpumask.h> /* * CPU feature register tracking * * The safe value of a CPUID feature field is dependent on the implications * of the values assigned to it by the architecture. Based on the relationship * between the values, the features are classified into 3 types - LOWER_SAFE, * HIGHER_SAFE and EXACT. * * The lowest value of all the CPUs is chosen for LOWER_SAFE and highest * for HIGHER_SAFE. It is expected that all CPUs have the same value for * a field when EXACT is specified, failing which, the safe value specified * in the table is chosen. */ enum ftr_type { FTR_EXACT, /* Use a predefined safe value */ FTR_LOWER_SAFE, /* Smaller value is safe */ FTR_HIGHER_SAFE, /* Bigger value is safe */ FTR_HIGHER_OR_ZERO_SAFE, /* Bigger value is safe, but 0 is biggest */ }; #define FTR_STRICT true /* SANITY check strict matching required */ #define FTR_NONSTRICT false /* SANITY check ignored */ #define FTR_SIGNED true /* Value should be treated as signed */ #define FTR_UNSIGNED false /* Value should be treated as unsigned */ #define FTR_VISIBLE true /* Feature visible to the user space */ #define FTR_HIDDEN false /* Feature is hidden from the user */ #define FTR_VISIBLE_IF_IS_ENABLED(config) \ (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN) struct arm64_ftr_bits { bool sign; /* Value is signed ? */ bool visible; bool strict; /* CPU Sanity check: strict matching required ? */ enum ftr_type type; u8 shift; u8 width; s64 safe_val; /* safe value for FTR_EXACT features */ }; /* * Describe the early feature override to the core override code: * * @val Values that are to be merged into the final * sanitised value of the register. Only the bitfields * set to 1 in @mask are valid * @mask Mask of the features that are overridden by @val * * A @mask field set to full-1 indicates that the corresponding field * in @val is a valid override. * * A @mask field set to full-0 with the corresponding @val field set * to full-0 denotes that this field has no override * * A @mask field set to full-0 with the corresponding @val field set * to full-1 denotes that this field has an invalid override. 
*/ struct arm64_ftr_override { u64 val; u64 mask; }; /* * @arm64_ftr_reg - Feature register * @strict_mask Bits which should match across all CPUs for sanity. * @sys_val Safe value across the CPUs (system view) */ struct arm64_ftr_reg { const char *name; u64 strict_mask; u64 user_mask; u64 sys_val; u64 user_val; struct arm64_ftr_override *override; const struct arm64_ftr_bits *ftr_bits; }; extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; /* * CPU capabilities: * * We use arm64_cpu_capabilities to represent system features, errata work * arounds (both used internally by kernel and tracked in system_cpucaps) and * ELF HWCAPs (which are exposed to user). * * To support systems with heterogeneous CPUs, we need to make sure that we * detect the capabilities correctly on the system and take appropriate * measures to ensure there are no incompatibilities. * * This comment tries to explain how we treat the capabilities. * Each capability has the following list of attributes : * * 1) Scope of Detection : The system detects a given capability by * performing some checks at runtime. This could be, e.g, checking the * value of a field in CPU ID feature register or checking the cpu * model. The capability provides a call back ( @matches() ) to * perform the check. Scope defines how the checks should be performed. * There are three cases: * * a) SCOPE_LOCAL_CPU: check all the CPUs and "detect" if at least one * matches. This implies, we have to run the check on all the * booting CPUs, until the system decides that state of the * capability is finalised. (See section 2 below) * Or * b) SCOPE_SYSTEM: check all the CPUs and "detect" if all the CPUs * matches. This implies, we run the check only once, when the * system decides to finalise the state of the capability. If the * capability relies on a field in one of the CPU ID feature * registers, we use the sanitised value of the register from the * CPU feature infrastructure to make the decision. * Or * c) SCOPE_BOOT_CPU: Check only on the primary boot CPU to detect the * feature. This category is for features that are "finalised" * (or used) by the kernel very early even before the SMP cpus * are brought up. * * The process of detection is usually denoted by "update" capability * state in the code. * * 2) Finalise the state : The kernel should finalise the state of a * capability at some point during its execution and take necessary * actions if any. Usually, this is done, after all the boot-time * enabled CPUs are brought up by the kernel, so that it can make * better decision based on the available set of CPUs. However, there * are some special cases, where the action is taken during the early * boot by the primary boot CPU. (e.g, running the kernel at EL2 with * Virtualisation Host Extensions). The kernel usually disallows any * changes to the state of a capability once it finalises the capability * and takes any action, as it may be impossible to execute the actions * safely. A CPU brought up after a capability is "finalised" is * referred to as "Late CPU" w.r.t the capability. e.g, all secondary * CPUs are treated "late CPUs" for capabilities determined by the boot * CPU. * * At the moment there are two passes of finalising the capabilities. * a) Boot CPU scope capabilities - Finalised by primary boot CPU via * setup_boot_cpu_capabilities(). * b) Everything except (a) - Run via setup_system_capabilities(). 
* * 3) Verification: When a CPU is brought online (e.g, by user or by the * kernel), the kernel should make sure that it is safe to use the CPU, * by verifying that the CPU is compliant with the state of the * capabilities finalised already. This happens via : * * secondary_start_kernel()-> check_local_cpu_capabilities() * * As explained in (2) above, capabilities could be finalised at * different points in the execution. Each newly booted CPU is verified * against the capabilities that have been finalised by the time it * boots. * * a) SCOPE_BOOT_CPU : All CPUs are verified against the capability * except for the primary boot CPU. * * b) SCOPE_LOCAL_CPU, SCOPE_SYSTEM: All CPUs hotplugged on by the * user after the kernel boot are verified against the capability. * * If there is a conflict, the kernel takes an action, based on the * severity (e.g, a CPU could be prevented from booting or cause a * kernel panic). The CPU is allowed to "affect" the state of the * capability, if it has not been finalised already. See section 5 * for more details on conflicts. * * 4) Action: As mentioned in (2), the kernel can take an action for each * detected capability, on all CPUs on the system. Appropriate actions * include, turning on an architectural feature, modifying the control * registers (e.g, SCTLR, TCR etc.) or patching the kernel via * alternatives. The kernel patching is batched and performed at later * point. The actions are always initiated only after the capability * is finalised. This is usally denoted by "enabling" the capability. * The actions are initiated as follows : * a) Action is triggered on all online CPUs, after the capability is * finalised, invoked within the stop_machine() context from * enable_cpu_capabilitie(). * * b) Any late CPU, brought up after (1), the action is triggered via: * * check_local_cpu_capabilities() -> verify_local_cpu_capabilities() * * 5) Conflicts: Based on the state of the capability on a late CPU vs. * the system state, we could have the following combinations : * * x-----------------------------x * | Type | System | Late CPU | * |-----------------------------| * | a | y | n | * |-----------------------------| * | b | n | y | * x-----------------------------x * * Two separate flag bits are defined to indicate whether each kind of * conflict can be allowed: * ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU - Case(a) is allowed * ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU - Case(b) is allowed * * Case (a) is not permitted for a capability that the system requires * all CPUs to have in order for the capability to be enabled. This is * typical for capabilities that represent enhanced functionality. * * Case (b) is not permitted for a capability that must be enabled * during boot if any CPU in the system requires it in order to run * safely. This is typical for erratum work arounds that cannot be * enabled after the corresponding capability is finalised. * * In some non-typical cases either both (a) and (b), or neither, * should be permitted. This can be described by including neither * or both flags in the capability's type field. * * In case of a conflict, the CPU is prevented from booting. If the * ARM64_CPUCAP_PANIC_ON_CONFLICT flag is specified for the capability, * then a kernel panic is triggered. */ /* * Decide how the capability is detected. 
* On any local CPU vs System wide vs the primary boot CPU */ #define ARM64_CPUCAP_SCOPE_LOCAL_CPU ((u16)BIT(0)) #define ARM64_CPUCAP_SCOPE_SYSTEM ((u16)BIT(1)) /* * The capabilitiy is detected on the Boot CPU and is used by kernel * during early boot. i.e, the capability should be "detected" and * "enabled" as early as possibly on all booting CPUs. */ #define ARM64_CPUCAP_SCOPE_BOOT_CPU ((u16)BIT(2)) #define ARM64_CPUCAP_SCOPE_MASK \ (ARM64_CPUCAP_SCOPE_SYSTEM | \ ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_SCOPE_BOOT_CPU) #define SCOPE_SYSTEM ARM64_CPUCAP_SCOPE_SYSTEM #define SCOPE_LOCAL_CPU ARM64_CPUCAP_SCOPE_LOCAL_CPU #define SCOPE_BOOT_CPU ARM64_CPUCAP_SCOPE_BOOT_CPU #define SCOPE_ALL ARM64_CPUCAP_SCOPE_MASK /* * Is it permitted for a late CPU to have this capability when system * hasn't already enabled it ? */ #define ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU ((u16)BIT(4)) /* Is it safe for a late CPU to miss this capability when system has it */ #define ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU ((u16)BIT(5)) /* Panic when a conflict is detected */ #define ARM64_CPUCAP_PANIC_ON_CONFLICT ((u16)BIT(6)) /* * CPU errata workarounds that need to be enabled at boot time if one or * more CPUs in the system requires it. When one of these capabilities * has been enabled, it is safe to allow any CPU to boot that doesn't * require the workaround. However, it is not safe if a "late" CPU * requires a workaround and the system hasn't enabled it already. */ #define ARM64_CPUCAP_LOCAL_CPU_ERRATUM \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) /* * CPU feature detected at boot time based on system-wide value of a * feature. It is safe for a late CPU to have this feature even though * the system hasn't enabled it, although the feature will not be used * by Linux in this case. If the system has enabled this feature already, * then every late CPU must have it. */ #define ARM64_CPUCAP_SYSTEM_FEATURE \ (ARM64_CPUCAP_SCOPE_SYSTEM | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) /* * CPU feature detected at boot time based on feature of one or more CPUs. * All possible conflicts for a late CPU are ignored. * NOTE: this means that a late CPU with the feature will *not* cause the * capability to be advertised by cpus_have_*cap()! */ #define ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU | \ ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) /* * CPU feature detected at boot time, on one or more CPUs. A late CPU * is not allowed to have the capability when the system doesn't have it. * It is Ok for a late CPU to miss the feature. */ #define ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) /* * CPU feature used early in the boot based on the boot CPU. All secondary * CPUs must match the state of the capability as detected by the boot CPU. In * case of a conflict, a kernel panic is triggered. */ #define ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PANIC_ON_CONFLICT) /* * CPU feature used early in the boot based on the boot CPU. It is safe for a * late CPU to have this feature even though the boot CPU hasn't enabled it, * although the feature will not be used by Linux in this case. If the boot CPU * has enabled this feature already, then every late CPU must have it. 
*/ #define ARM64_CPUCAP_BOOT_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) struct arm64_cpu_capabilities { const char *desc; u16 capability; u16 type; bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope); /* * Take the appropriate actions to configure this capability * for this CPU. If the capability is detected by the kernel * this will be called on all the CPUs in the system, * including the hotplugged CPUs, regardless of whether the * capability is available on that specific CPU. This is * useful for some capabilities (e.g, working around CPU * errata), where all the CPUs must take some action (e.g, * changing system control/configuration). Thus, if an action * is required only if the CPU has the capability, then the * routine must check it before taking any action. */ void (*cpu_enable)(const struct arm64_cpu_capabilities *cap); union { struct { /* To be used for erratum handling only */ struct midr_range midr_range; const struct arm64_midr_revidr { u32 midr_rv; /* revision/variant */ u32 revidr_mask; } * const fixed_revs; }; const struct midr_range *midr_range_list; struct { /* Feature register checking */ u32 sys_reg; u8 field_pos; u8 field_width; u8 min_field_value; u8 max_field_value; u8 hwcap_type; bool sign; unsigned long hwcap; }; }; /* * An optional list of "matches/cpu_enable" pair for the same * "capability" of the same "type" as described by the parent. * Only matches(), cpu_enable() and fields relevant to these * methods are significant in the list. The cpu_enable is * invoked only if the corresponding entry "matches()". * However, if a cpu_enable() method is associated * with multiple matches(), care should be taken that either * the match criteria are mutually exclusive, or that the * method is robust against being called multiple times. */ const struct arm64_cpu_capabilities *match_list; const struct cpumask *cpus; }; static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap) { return cap->type & ARM64_CPUCAP_SCOPE_MASK; } /* * Generic helper for handling capabilities with multiple (match,enable) pairs * of call backs, sharing the same capability bit. * Iterate over each entry to see if at least one matches. 
*/ static inline bool cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry, int scope) { const struct arm64_cpu_capabilities *caps; for (caps = entry->match_list; caps->matches; caps++) if (caps->matches(caps, scope)) return true; return false; } static __always_inline bool is_vhe_hyp_code(void) { /* Only defined for code run in VHE hyp context */ return __is_defined(__KVM_VHE_HYPERVISOR__); } static __always_inline bool is_nvhe_hyp_code(void) { /* Only defined for code run in NVHE hyp context */ return __is_defined(__KVM_NVHE_HYPERVISOR__); } static __always_inline bool is_hyp_code(void) { return is_vhe_hyp_code() || is_nvhe_hyp_code(); } extern DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); extern DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); #define for_each_available_cap(cap) \ for_each_set_bit(cap, system_cpucaps, ARM64_NCAPS) bool this_cpu_has_cap(unsigned int cap); void cpu_set_feature(unsigned int num); bool cpu_have_feature(unsigned int num); unsigned long cpu_get_elf_hwcap(void); unsigned long cpu_get_elf_hwcap2(void); unsigned long cpu_get_elf_hwcap3(void); #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) static __always_inline bool boot_capabilities_finalized(void) { return alternative_has_cap_likely(ARM64_ALWAYS_BOOT); } static __always_inline bool system_capabilities_finalized(void) { return alternative_has_cap_likely(ARM64_ALWAYS_SYSTEM); } /* * Test for a capability with a runtime check. * * Before the capability is detected, this returns false. */ static __always_inline bool cpus_have_cap(unsigned int num) { if (__builtin_constant_p(num) && !cpucap_is_possible(num)) return false; if (num >= ARM64_NCAPS) return false; return arch_test_bit(num, system_cpucaps); } /* * Test for a capability without a runtime check. * * Before boot capabilities are finalized, this will BUG(). * After boot capabilities are finalized, this is patched to avoid a runtime * check. * * @num must be a compile-time constant. */ static __always_inline bool cpus_have_final_boot_cap(int num) { if (boot_capabilities_finalized()) return alternative_has_cap_unlikely(num); else BUG(); } /* * Test for a capability without a runtime check. * * Before system capabilities are finalized, this will BUG(). * After system capabilities are finalized, this is patched to avoid a runtime * check. * * @num must be a compile-time constant. */ static __always_inline bool cpus_have_final_cap(int num) { if (system_capabilities_finalized()) return alternative_has_cap_unlikely(num); else BUG(); } static inline int __attribute_const__ cpuid_feature_extract_signed_field_width(u64 features, int field, int width) { return (s64)(features << (64 - width - field)) >> (64 - width); } static inline int __attribute_const__ cpuid_feature_extract_signed_field(u64 features, int field) { return cpuid_feature_extract_signed_field_width(features, field, 4); } static __always_inline unsigned int __attribute_const__ cpuid_feature_extract_unsigned_field_width(u64 features, int field, int width) { return (u64)(features << (64 - width - field)) >> (64 - width); } static __always_inline unsigned int __attribute_const__ cpuid_feature_extract_unsigned_field(u64 features, int field) { return cpuid_feature_extract_unsigned_field_width(features, field, 4); } /* * Fields that identify the version of the Performance Monitors Extension do * not follow the standard ID scheme. 
See ARM DDI 0487E.a page D13-2825, * "Alternative ID scheme used for the Performance Monitors Extension version". */ static inline u64 __attribute_const__ cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap) { u64 val = cpuid_feature_extract_unsigned_field(features, field); u64 mask = GENMASK_ULL(field + 3, field); /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ if (val == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val = 0; if (val > cap) { features &= ~mask; features |= (cap << field) & mask; } return features; } static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp) { return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift); } static inline u64 arm64_ftr_reg_user_value(const struct arm64_ftr_reg *reg) { return (reg->user_val | (reg->sys_val & reg->user_mask)); } static inline int __attribute_const__ cpuid_feature_extract_field_width(u64 features, int field, int width, bool sign) { if (WARN_ON_ONCE(!width)) width = 4; return (sign) ? cpuid_feature_extract_signed_field_width(features, field, width) : cpuid_feature_extract_unsigned_field_width(features, field, width); } static inline int __attribute_const__ cpuid_feature_extract_field(u64 features, int field, bool sign) { return cpuid_feature_extract_field_width(features, field, 4, sign); } static inline s64 arm64_ftr_value(const struct arm64_ftr_bits *ftrp, u64 val) { return (s64)cpuid_feature_extract_field_width(val, ftrp->shift, ftrp->width, ftrp->sign); } static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) { return cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT) == 0x1 || cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT) == 0x1; } static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL1_SHIFT); return val == ID_AA64PFR0_EL1_EL1_AARCH32; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL0_SHIFT); return val == ID_AA64PFR0_EL1_EL0_AARCH32; } static inline bool id_aa64pfr0_sve(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SVE_SHIFT); return val > 0; } static inline bool id_aa64pfr1_sme(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_SME_SHIFT); return val > 0; } static inline bool id_aa64pfr0_mpam(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT); return val > 0; } static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); return val >= ID_AA64PFR1_EL1_MTE_MTE2; } void __init setup_boot_cpu_features(void); void __init setup_system_features(void); void __init setup_user_features(void); void check_local_cpu_capabilities(void); u64 read_sanitised_ftr_reg(u32 id); u64 __read_sysreg_by_encoding(u32 sys_id); static inline bool cpu_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } static inline bool supports_csv2p3(int scope) { u64 pfr0; u8 csv2_val; if (scope == SCOPE_LOCAL_CPU) pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); else pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); csv2_val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT); return csv2_val == 3; } static inline bool supports_clearbhb(int scope) { u64 isar2; if (scope == SCOPE_LOCAL_CPU) isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); else isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); 
return cpuid_feature_extract_unsigned_field(isar2, ID_AA64ISAR2_EL1_CLRBHB_SHIFT); } const struct cpumask *system_32bit_el0_cpumask(void); const struct cpumask *fallback_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); static inline bool system_supports_32bit_el0(void) { u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); return static_branch_unlikely(&arm64_mismatched_32bit_el0) || id_aa64pfr0_32bit_el0(pfr0); } static inline bool system_supports_4kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN4_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX); } static inline bool system_supports_64kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN64_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX); } static inline bool system_supports_16kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN16_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX); } static inline bool system_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT); return val == 0x1; } static __always_inline bool system_supports_fpsimd(void) { return alternative_has_cap_likely(ARM64_HAS_FPSIMD); } static inline bool system_uses_hw_pan(void) { return alternative_has_cap_unlikely(ARM64_HAS_PAN); } static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && !system_uses_hw_pan(); } static __always_inline bool system_supports_sve(void) { return alternative_has_cap_unlikely(ARM64_SVE); } static __always_inline bool system_supports_sme(void) { return alternative_has_cap_unlikely(ARM64_SME); } static __always_inline bool system_supports_sme2(void) { return alternative_has_cap_unlikely(ARM64_SME2); } static __always_inline bool system_supports_fa64(void) { return alternative_has_cap_unlikely(ARM64_SME_FA64); } static __always_inline bool system_supports_tpidr2(void) { return system_supports_sme(); } static __always_inline bool system_supports_fpmr(void) { return alternative_has_cap_unlikely(ARM64_HAS_FPMR); } static __always_inline bool system_supports_cnp(void) { return alternative_has_cap_unlikely(ARM64_HAS_CNP); } static inline bool system_supports_address_auth(void) { return cpus_have_final_boot_cap(ARM64_HAS_ADDRESS_AUTH); } static inline bool system_supports_generic_auth(void) { return alternative_has_cap_unlikely(ARM64_HAS_GENERIC_AUTH); } static inline bool system_has_full_ptr_auth(void) { return system_supports_address_auth() && system_supports_generic_auth(); } static __always_inline bool system_uses_irq_prio_masking(void) { return alternative_has_cap_unlikely(ARM64_HAS_GIC_PRIO_MASKING); } static inline bool system_supports_mte(void) { return alternative_has_cap_unlikely(ARM64_MTE); } static inline bool system_has_prio_mask_debugging(void) { return 
IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) && system_uses_irq_prio_masking(); } static inline bool system_supports_bti(void) { return cpus_have_final_cap(ARM64_BTI); } static inline bool system_supports_bti_kernel(void) { return IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpus_have_final_boot_cap(ARM64_BTI); } static inline bool system_supports_tlb_range(void) { return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE); } static inline bool system_supports_lpa2(void) { return cpus_have_final_cap(ARM64_HAS_LPA2); } static inline bool system_supports_poe(void) { return alternative_has_cap_unlikely(ARM64_HAS_S1POE); } static inline bool system_supports_gcs(void) { return alternative_has_cap_unlikely(ARM64_HAS_GCS); } static inline bool system_supports_haft(void) { return cpus_have_final_cap(ARM64_HAFT); } static __always_inline bool system_supports_mpam(void) { return alternative_has_cap_unlikely(ARM64_MPAM); } static __always_inline bool system_supports_mpam_hcr(void) { return alternative_has_cap_unlikely(ARM64_MPAM_HCR); } int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { case ID_AA64MMFR0_EL1_PARANGE_32: return 32; case ID_AA64MMFR0_EL1_PARANGE_36: return 36; case ID_AA64MMFR0_EL1_PARANGE_40: return 40; case ID_AA64MMFR0_EL1_PARANGE_42: return 42; case ID_AA64MMFR0_EL1_PARANGE_44: return 44; case ID_AA64MMFR0_EL1_PARANGE_48: return 48; case ID_AA64MMFR0_EL1_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme * for fields in ID registers", ARM DDI 0487C.a, any new * value is guaranteed to be higher than what we know already. * As a safe limit, we return the limit supported by the kernel. */ default: return CONFIG_ARM64_PA_BITS; } } /* Check whether hardware update of the Access flag is supported */ static inline bool cpu_has_hw_af(void) { u64 mmfr1; if (!IS_ENABLED(CONFIG_ARM64_HW_AFDBM)) return false; /* * Use cached version to avoid emulated msr operation on KVM * guests. */ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_HAFDBS_SHIFT); } static inline bool cpu_has_pan(void) { u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_PAN_SHIFT); } #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); #else static inline bool cpu_has_amu_feat(int cpu) { return false; } #endif /* Get a cpu that supports the Activity Monitors Unit (AMU) */ extern int get_cpu_with_amu_feat(void); static inline unsigned int get_vmid_bits(u64 mmfr1) { int vmid_bits; vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_VMIDBits_SHIFT); if (vmid_bits == ID_AA64MMFR1_EL1_VMIDBits_16) return 16; /* * Return the default here even if any reserved * value is fetched from the system register. 
*/ return 8; } s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); extern struct arm64_ftr_override id_aa64mmfr0_override; extern struct arm64_ftr_override id_aa64mmfr1_override; extern struct arm64_ftr_override id_aa64mmfr2_override; extern struct arm64_ftr_override id_aa64pfr0_override; extern struct arm64_ftr_override id_aa64pfr1_override; extern struct arm64_ftr_override id_aa64zfr0_override; extern struct arm64_ftr_override id_aa64smfr0_override; extern struct arm64_ftr_override id_aa64isar1_override; extern struct arm64_ftr_override id_aa64isar2_override; extern struct arm64_ftr_override arm64_sw_feature_override; static inline u64 arm64_apply_feature_override(u64 val, int feat, int width, const struct arm64_ftr_override *override) { u64 oval = override->val; /* * When it encounters an invalid override (e.g., an override that * cannot be honoured due to a missing CPU feature), the early idreg * override code will set the mask to 0x0 and the value to non-zero for * the field in question. In order to determine whether the override is * valid or not for the field we are interested in, we first need to * disregard bits belonging to other fields. */ oval &= GENMASK_ULL(feat + width - 1, feat); /* * The override is valid if all value bits are accounted for in the * mask. If so, replace the masked bits with the override value. */ if (oval == (oval & override->mask)) { val &= ~override->mask; val |= oval; } /* Extract the field from the updated value */ return cpuid_feature_extract_unsigned_field(val, feat); } static inline bool arm64_test_sw_feature_override(int feat) { /* * Software features are pseudo CPU features that have no underlying * CPUID system register value to apply the override to. */ return arm64_apply_feature_override(0, feat, 4, &arm64_sw_feature_override); } static inline bool kaslr_disabled_cmdline(void) { return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR); } u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); static inline bool cpu_has_bti(void) { if (!IS_ENABLED(CONFIG_ARM64_BTI)) return false; return arm64_apply_feature_override(read_cpuid(ID_AA64PFR1_EL1), ID_AA64PFR1_EL1_BT_SHIFT, 4, &id_aa64pfr1_override); } static inline bool cpu_has_pac(void) { u64 isar1, isar2; if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) return false; isar1 = read_cpuid(ID_AA64ISAR1_EL1); isar2 = read_cpuid(ID_AA64ISAR2_EL1); if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_APA_SHIFT, 4, &id_aa64isar1_override)) return true; if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_API_SHIFT, 4, &id_aa64isar1_override)) return true; return arm64_apply_feature_override(isar2, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, &id_aa64isar2_override); } static inline bool cpu_has_lva(void) { u64 mmfr2; mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); mmfr2 &= ~id_aa64mmfr2_override.mask; mmfr2 |= id_aa64mmfr2_override.val; return cpuid_feature_extract_unsigned_field(mmfr2, ID_AA64MMFR2_EL1_VARange_SHIFT); } static inline bool cpu_has_lpa2(void) { #ifdef CONFIG_ARM64_LPA2 u64 mmfr0; int feat; mmfr0 = read_sysreg(id_aa64mmfr0_el1); mmfr0 &= ~id_aa64mmfr0_override.mask; mmfr0 |= id_aa64mmfr0_override.val; feat = cpuid_feature_extract_signed_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_SHIFT); return feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2; #else return false; #endif } #endif /* __ASSEMBLY__ */ #endif
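As a usage sketch for the helpers above: callers normally read the sanitised system-wide value of an ID register and extract a single field from it. The helper below is hypothetical (it is not part of cpufeature.h) and assumes the ID_AA64MMFR0_EL1_ECV_SHIFT constant from the generated sysreg headers; it mirrors the pattern used by system_supports_4kb_granule() and similar predicates in the header above.

#include <asm/cpufeature.h>
#include <asm/sysreg.h>

/* Hypothetical predicate: does the sanitised view of the system report ECV? */
static inline bool example_cpu_has_ecv(void)
{
	u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);

	/* ECV is a 4-bit unsigned field; any non-zero value means "implemented". */
	return cpuid_feature_extract_unsigned_field(mmfr0,
						    ID_AA64MMFR0_EL1_ECV_SHIFT) > 0;
}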
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_REF_H #define _LINUX_PAGE_REF_H #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/tracepoint-defs.h> DECLARE_TRACEPOINT(page_ref_set); DECLARE_TRACEPOINT(page_ref_mod); DECLARE_TRACEPOINT(page_ref_mod_and_test); DECLARE_TRACEPOINT(page_ref_mod_and_return); DECLARE_TRACEPOINT(page_ref_mod_unless); DECLARE_TRACEPOINT(page_ref_freeze); DECLARE_TRACEPOINT(page_ref_unfreeze); #ifdef CONFIG_DEBUG_PAGE_REF /* * Ideally we would want to use the trace_<tracepoint>_enabled() helper * functions. But due to include header file issues, that is not * feasible. Instead we have to open code the static key functions. * * See trace_##name##_enabled(void) in include/linux/tracepoint.h */ #define page_ref_tracepoint_active(t) tracepoint_enabled(t) extern void __page_ref_set(struct page *page, int v); extern void __page_ref_mod(struct page *page, int v); extern void __page_ref_mod_and_test(struct page *page, int v, int ret); extern void __page_ref_mod_and_return(struct page *page, int v, int ret); extern void __page_ref_mod_unless(struct page *page, int v, int u); extern void __page_ref_freeze(struct page *page, int v, int ret); extern void __page_ref_unfreeze(struct page *page, int v); #else #define page_ref_tracepoint_active(t) false static inline void __page_ref_set(struct page *page, int v) { } static inline void __page_ref_mod(struct page *page, int v) { } static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) { } static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) { } static inline void __page_ref_mod_unless(struct page *page, int v, int u) { } static inline void __page_ref_freeze(struct page *page, int v, int ret) { } static inline void __page_ref_unfreeze(struct page *page, int v) { } #endif static inline int page_ref_count(const struct page *page) { return atomic_read(&page->_refcount); } /** * folio_ref_count - The reference count on this folio. * @folio: The folio. * * The refcount is usually incremented by calls to folio_get() and * decremented by calls to folio_put(). 
Some typical users of the * folio refcount: * * - Each reference from a page table * - The page cache * - Filesystem private data * - The LRU list * - Pipes * - Direct IO which references this page in the process address space * * Return: The number of references to this folio. */ static inline int folio_ref_count(const struct folio *folio) { return page_ref_count(&folio->page); } static inline int page_count(const struct page *page) { return folio_ref_count(page_folio(page)); } static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(page_ref_set)) __page_ref_set(page, v); } static inline void folio_set_count(struct folio *folio, int v) { set_page_count(&folio->page, v); } /* * Setup the page count before being freed into the page allocator for * the first time (boot or memory hotplug) */ static inline void init_page_count(struct page *page) { set_page_count(page, 1); } static inline void page_ref_add(struct page *page, int nr) { atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, nr); } static inline void folio_ref_add(struct folio *folio, int nr) { page_ref_add(&folio->page, nr); } static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -nr); } static inline void folio_ref_sub(struct folio *folio, int nr) { page_ref_sub(&folio->page, nr); } static inline int folio_ref_sub_return(struct folio *folio, int nr) { int ret = atomic_sub_return(nr, &folio->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(&folio->page, -nr, ret); return ret; } static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, 1); } static inline void folio_ref_inc(struct folio *folio) { page_ref_inc(&folio->page); } static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -1); } static inline void folio_ref_dec(struct folio *folio) { page_ref_dec(&folio->page); } static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); return ret; } static inline int folio_ref_sub_and_test(struct folio *folio, int nr) { return page_ref_sub_and_test(&folio->page, nr); } static inline int page_ref_inc_return(struct page *page) { int ret = atomic_inc_return(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, 1, ret); return ret; } static inline int folio_ref_inc_return(struct folio *folio) { return page_ref_inc_return(&folio->page); } static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); return ret; } static inline int folio_ref_dec_and_test(struct folio *folio) { return page_ref_dec_and_test(&folio->page); } static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); return ret; } static inline int folio_ref_dec_return(struct folio *folio) { return page_ref_dec_return(&folio->page); } 
static inline bool page_ref_add_unless(struct page *page, int nr, int u) { bool ret = false; rcu_read_lock(); /* avoid writing to the vmemmap area being remapped */ if (!page_is_fake_head(page) && page_ref_count(page) != u) ret = atomic_add_unless(&page->_refcount, nr, u); rcu_read_unlock(); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) { return page_ref_add_unless(&folio->page, nr, u); } /** * folio_try_get - Attempt to increase the refcount on a folio. * @folio: The folio. * * If you do not already have a reference to a folio, you can attempt to * get one using this function. It may fail if, for example, the folio * has been freed since you found a pointer to it, or it is frozen for * the purposes of splitting or migration. * * Return: True if the reference count was successfully incremented. */ static inline bool folio_try_get(struct folio *folio) { return folio_ref_add_unless(folio, 1, 0); } static inline bool folio_ref_try_add(struct folio *folio, int count) { return folio_ref_add_unless(folio, count, 0); } static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(page_ref_freeze)) __page_ref_freeze(page, count, ret); return ret; } static inline int folio_ref_freeze(struct folio *folio, int count) { return page_ref_freeze(&folio->page, count); } static inline void page_ref_unfreeze(struct page *page, int count) { VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); atomic_set_release(&page->_refcount, count); if (page_ref_tracepoint_active(page_ref_unfreeze)) __page_ref_unfreeze(page, count); } static inline void folio_ref_unfreeze(struct folio *folio, int count) { page_ref_unfreeze(&folio->page, count); } #endif
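To illustrate folio_try_get() in context, here is a hypothetical lockless lookup in the spirit of the page cache: the folio pointer is found under RCU, and a reference is taken only if the refcount has not already dropped to zero. The function name and the bare xa_load() are illustrative; the real filemap lookup additionally re-validates the slot after taking the reference before trusting the folio.

#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Hypothetical speculative lookup built on folio_try_get(). */
static struct folio *example_lookup_folio(struct address_space *mapping,
					  pgoff_t index)
{
	struct folio *folio;

	rcu_read_lock();
	/* Lockless lookup: the folio may be freed and reused at any time. */
	folio = xa_load(&mapping->i_pages, index);
	if (folio && !xa_is_value(folio) && !folio_try_get(folio))
		folio = NULL;		/* refcount had already reached zero */
	rcu_read_unlock();

	return folio;			/* caller drops with folio_put() when done */
}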
// SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking.
*/ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), 
K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } 
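/*
 * Illustrative expansion, not part of backing-dev.c: BDI_SHOW(read_ahead_kb,
 * K(bdi->ra_pages)) above generates roughly the following show routine and a
 * read-write device attribute.  The BDI_SHOW(max_ratio, ...) invocation that
 * follows completes the max_ratio attribute in exactly the same way.
 */
static ssize_t read_ahead_kb_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%lld\n", (long long)K(bdi->ra_pages));
}
static DEVICE_ATTR_RW(read_ahead_kb);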
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ 
#define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). 
*/ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. 
*/ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. 
*/ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. 
*/ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { del_timer_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name);
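/*
 * Illustrative sketch, not part of backing-dev.c: the lifecycle a block-layer
 * user walks through with the functions above.  Error handling is trimmed and
 * the "mydev" naming is made up for the example.
 */
#include <linux/backing-dev.h>

static struct backing_dev_info *my_bdi_setup(int node, int minor)
{
	struct backing_dev_info *bdi;

	bdi = bdi_alloc(node);		/* returns with one reference held */
	if (!bdi)
		return NULL;

	if (bdi_register(bdi, "mydev%d", minor)) {
		bdi_put(bdi);
		return NULL;
	}
	return bdi;			/* sysfs and debugfs entries now exist */
}

static void my_bdi_teardown(struct backing_dev_info *bdi)
{
	bdi_unregister(bdi);		/* shuts down writeback, removes the device */
	bdi_put(bdi);			/* drops the reference taken in bdi_alloc() */
}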
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/cmpxchg.h * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_CMPXCHG_H #define __ASM_CMPXCHG_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <asm/barrier.h> #include <asm/lse.h> /* * We need separate acquire parameters for ll/sc and lse, since the full * barrier case is generated as release+dmb for the former and * acquire+release for the latter. */ #define __XCHG_CASE(w, sfx, name, sz, mb, nop_lse, acq, acq_lse, rel, cl) \ static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr) \ { \ u##sz ret; \ unsigned long tmp; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ /* LL/SC */ \ " prfm pstl1strm, %2\n" \ "1: ld" #acq "xr" #sfx "\t%" #w "0, %2\n" \ " st" #rel "xr" #sfx "\t%w1, %" #w "3, %2\n" \ " cbnz %w1, 1b\n" \ " " #mb, \ /* LSE atomics */ \ " swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n" \ __nops(3) \ " " #nop_lse) \ : "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr) \ : "r" (x) \ : cl); \ \ return ret; \ } __XCHG_CASE(w, b, , 8, , , , , , ) __XCHG_CASE(w, h, , 16, , , , , , ) __XCHG_CASE(w, , , 32, , , , , , ) __XCHG_CASE( , , , 64, , , , , , ) __XCHG_CASE(w, b, acq_, 8, , , a, a, , "memory") __XCHG_CASE(w, h, acq_, 16, , , a, a, , "memory") __XCHG_CASE(w, , acq_, 32, , , a, a, , "memory") __XCHG_CASE( , , acq_, 64, , , a, a, , "memory") __XCHG_CASE(w, b, rel_, 8, , , , , l, "memory") __XCHG_CASE(w, h, rel_, 16, , , , , l, "memory") __XCHG_CASE(w, , rel_, 32, , , , , l, "memory") __XCHG_CASE( , , rel_, 64, , , , , l, "memory") __XCHG_CASE(w, b, mb_, 8, dmb ish, nop, , a, l, "memory") __XCHG_CASE(w, h, mb_, 16, dmb ish, nop, , a, l, "memory") __XCHG_CASE(w, , mb_, 32, dmb ish, nop, , a, l, "memory") __XCHG_CASE( , , mb_, 64, dmb ish, nop, , a, l, "memory") #undef __XCHG_CASE #define __XCHG_GEN(sfx) \ static __always_inline unsigned long \ __arch_xchg##sfx(unsigned long x, volatile void *ptr, int size) \ { \ switch (size) { \ case 1: \ return __xchg_case##sfx##_8(x, ptr); \ case 2: \ return __xchg_case##sfx##_16(x, ptr); \ case 4: \ return __xchg_case##sfx##_32(x, ptr); \ case 8: \ return __xchg_case##sfx##_64(x, ptr); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __XCHG_GEN() __XCHG_GEN(_acq) __XCHG_GEN(_rel) __XCHG_GEN(_mb) #undef __XCHG_GEN #define __xchg_wrapper(sfx, ptr, x) \ ({ \ __typeof__(*(ptr)) __ret; \ __ret = (__typeof__(*(ptr))) \ __arch_xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \ __ret; \ }) /* xchg */ #define arch_xchg_relaxed(...)
__xchg_wrapper( , __VA_ARGS__) #define arch_xchg_acquire(...) __xchg_wrapper(_acq, __VA_ARGS__) #define arch_xchg_release(...) __xchg_wrapper(_rel, __VA_ARGS__) #define arch_xchg(...) __xchg_wrapper( _mb, __VA_ARGS__) #define __CMPXCHG_CASE(name, sz) \ static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr, \ u##sz old, \ u##sz new) \ { \ return __lse_ll_sc_body(_cmpxchg_case_##name##sz, \ ptr, old, new); \ } __CMPXCHG_CASE( , 8) __CMPXCHG_CASE( , 16) __CMPXCHG_CASE( , 32) __CMPXCHG_CASE( , 64) __CMPXCHG_CASE(acq_, 8) __CMPXCHG_CASE(acq_, 16) __CMPXCHG_CASE(acq_, 32) __CMPXCHG_CASE(acq_, 64) __CMPXCHG_CASE(rel_, 8) __CMPXCHG_CASE(rel_, 16) __CMPXCHG_CASE(rel_, 32) __CMPXCHG_CASE(rel_, 64) __CMPXCHG_CASE(mb_, 8) __CMPXCHG_CASE(mb_, 16) __CMPXCHG_CASE(mb_, 32) __CMPXCHG_CASE(mb_, 64) #undef __CMPXCHG_CASE #define __CMPXCHG128(name) \ static inline u128 __cmpxchg128##name(volatile u128 *ptr, \ u128 old, u128 new) \ { \ return __lse_ll_sc_body(_cmpxchg128##name, \ ptr, old, new); \ } __CMPXCHG128( ) __CMPXCHG128(_mb) #undef __CMPXCHG128 #define __CMPXCHG_GEN(sfx) \ static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ unsigned long old, \ unsigned long new, \ int size) \ { \ switch (size) { \ case 1: \ return __cmpxchg_case##sfx##_8(ptr, old, new); \ case 2: \ return __cmpxchg_case##sfx##_16(ptr, old, new); \ case 4: \ return __cmpxchg_case##sfx##_32(ptr, old, new); \ case 8: \ return __cmpxchg_case##sfx##_64(ptr, old, new); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __CMPXCHG_GEN() __CMPXCHG_GEN(_acq) __CMPXCHG_GEN(_rel) __CMPXCHG_GEN(_mb) #undef __CMPXCHG_GEN #define __cmpxchg_wrapper(sfx, ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __ret = (__typeof__(*(ptr))) \ __cmpxchg##sfx((ptr), (unsigned long)(o), \ (unsigned long)(n), sizeof(*(ptr))); \ __ret; \ }) /* cmpxchg */ #define arch_cmpxchg_relaxed(...) __cmpxchg_wrapper( , __VA_ARGS__) #define arch_cmpxchg_acquire(...) __cmpxchg_wrapper(_acq, __VA_ARGS__) #define arch_cmpxchg_release(...) __cmpxchg_wrapper(_rel, __VA_ARGS__) #define arch_cmpxchg(...) 
__cmpxchg_wrapper( _mb, __VA_ARGS__) #define arch_cmpxchg_local arch_cmpxchg_relaxed /* cmpxchg64 */ #define arch_cmpxchg64_relaxed arch_cmpxchg_relaxed #define arch_cmpxchg64_acquire arch_cmpxchg_acquire #define arch_cmpxchg64_release arch_cmpxchg_release #define arch_cmpxchg64 arch_cmpxchg #define arch_cmpxchg64_local arch_cmpxchg_local /* cmpxchg128 */ #define system_has_cmpxchg128() 1 #define arch_cmpxchg128(ptr, o, n) \ ({ \ __cmpxchg128_mb((ptr), (o), (n)); \ }) #define arch_cmpxchg128_local(ptr, o, n) \ ({ \ __cmpxchg128((ptr), (o), (n)); \ }) #define __CMPWAIT_CASE(w, sfx, sz) \ static inline void __cmpwait_case_##sz(volatile void *ptr, \ unsigned long val) \ { \ unsigned long tmp; \ \ asm volatile( \ " sevl\n" \ " wfe\n" \ " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \ " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \ " cbnz %" #w "[tmp], 1f\n" \ " wfe\n" \ "1:" \ : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \ : [val] "r" (val)); \ } __CMPWAIT_CASE(w, b, 8); __CMPWAIT_CASE(w, h, 16); __CMPWAIT_CASE(w, , 32); __CMPWAIT_CASE( , , 64); #undef __CMPWAIT_CASE #define __CMPWAIT_GEN(sfx) \ static __always_inline void __cmpwait##sfx(volatile void *ptr, \ unsigned long val, \ int size) \ { \ switch (size) { \ case 1: \ return __cmpwait_case##sfx##_8(ptr, (u8)val); \ case 2: \ return __cmpwait_case##sfx##_16(ptr, (u16)val); \ case 4: \ return __cmpwait_case##sfx##_32(ptr, val); \ case 8: \ return __cmpwait_case##sfx##_64(ptr, val); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __CMPWAIT_GEN() #undef __CMPWAIT_GEN #define __cmpwait_relaxed(ptr, val) \ __cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr))) #endif /* __ASM_CMPXCHG_H */
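/*
 * Illustrative sketch, not part of cmpxchg.h: the wrappers above dispatch on
 * sizeof(*ptr) and return the previously observed value, which is what a
 * caller loops on.  Generic kernel code would normally use the atomic_t API;
 * the helper name and the capping policy here are made up for the example.
 */
#include <linux/atomic.h>
#include <linux/compiler.h>

static inline unsigned long my_fetch_add_capped(unsigned long *p,
						unsigned long add,
						unsigned long cap)
{
	unsigned long old, new;

	do {
		old = READ_ONCE(*p);
		new = old + add;
		if (new > cap)
			new = cap;
		/* cmpxchg() returns the value seen before the update. */
	} while (cmpxchg(p, old, new) != old);

	return old;
}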
/* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes.
* * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/rhashtable-types.h> #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. 
*/ #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). 
*/ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. 
* * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. 
*/ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). 
*/ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. 
*/ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ __u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). 
Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to avoid freeing per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. 
*/ local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? 
lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & 
PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) return; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) return; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (!has_branch_stack(event)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) return; if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. 
*/ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * 
exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else static inline unsigned int 
perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. 
*/ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); 
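/*
 * A minimal sketch (not part of this header) of how core code typically pairs
 * the ring-buffer output helpers declared here when emitting a side-band
 * record: perf_event_header__init_id() sizes the optional ID sample,
 * perf_output_begin() reserves space, perf_output_put() (the macro defined a
 * little further below) copies the payload, and perf_output_end() publishes
 * it. The record layout and the helper name perf_log_example() are
 * hypothetical; only the calls themselves come from the declarations nearby.
 */
static void perf_log_example(struct perf_event *event, u64 value)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;
	struct {
		struct perf_event_header header;
		u64 value;
	} rec = {
		.header = {
			.type = PERF_RECORD_MAX,	/* placeholder record type */
			.misc = 0,
			.size = sizeof(rec),
		},
		.value = value,
	};
	int ret;

	/* Fill in the optional ID sample fields and grow header.size. */
	perf_event_header__init_id(&rec.header, &sample, event);

	/* Reserve space in the ring buffer; bail out if there is no room. */
	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
	if (ret)
		return;

	/* Copy the fixed part, then the ID sample, then publish the record. */
	perf_output_put(&handle, rec);
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
}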
extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline 
void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. 
Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */
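/*
 * A minimal sketch, outside the header proper, of the in-kernel counter API
 * declared above: perf_event_create_kernel_counter(), perf_event_read_value()
 * and perf_event_release_kernel(). The example_* names are hypothetical and
 * error handling is reduced to the essentials; a software clock event is used
 * so the sketch does not depend on a particular hardware PMU.
 */
#include <linux/perf_event.h>
#include <linux/err.h>

static struct perf_event *example_event;

static int example_counter_start(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.size		= sizeof(attr),
		.pinned		= 1,
	};

	/* CPU-bound event (task == NULL), counting only, no overflow handler. */
	example_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
							 NULL, NULL);
	if (IS_ERR(example_event))
		return PTR_ERR(example_event);

	return 0;
}

static u64 example_counter_read(void)
{
	u64 enabled, running;

	/* Updates the event if active and folds in child counts. */
	return perf_event_read_value(example_event, &enabled, &running);
}

static void example_counter_stop(void)
{
	perf_event_release_kernel(example_event);
	example_event = NULL;
}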
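/*
 * For contrast with the kernel-internal API above, a minimal runnable
 * user-space sketch of the same event model via perf_event_open(2): open a
 * software counter on the current task, enable it around a workload, then
 * read the 64-bit count. Assumes the Linux uapi headers are available; the
 * busy loop stands in for a real workload and is illustrative only.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled = 1;		/* start stopped, enable explicitly */
	attr.exclude_kernel = 1;	/* avoids needing extra privileges */

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	for (volatile int i = 0; i < 1000000; i++)	/* stand-in workload */
		;

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);

	close(fd);
	return 0;
}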
// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; struct cpu_fbatches { /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages.
*/ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. 
*/ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool on_lru, bool disable_irq) { unsigned long flags; if (on_lru && !folio_test_clear_lru(folio)) return; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op, on_lru) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ on_lru, \ offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. 
*/ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_activate, true); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. 
*/ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); if (gen < 0) return true; set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). 
*/ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. 
*/ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. 
Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? 
refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[PAGEVEC_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. 
*/ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ }
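Much of the file above funnels through the same pattern: folio_batch_add_and_move() stages folios in a small per-CPU folio_batch and the expensive per-folio work (taking the LRU lock, moving the folio) only happens when the batch fills up or is explicitly drained. The fragment below is a minimal stand-alone sketch of that staging idea, with invented stand-ins (BATCH_SIZE, plain int items, move_to_lru()) in place of the kernel's PAGEVEC_SIZE and struct folio; it illustrates the batching behaviour and is not kernel code.

/*
 * Stand-alone sketch (not kernel code): stage items in a small fixed-size
 * batch, drain them all at once when the batch fills or on an explicit
 * final drain, mirroring folio_batch_add_and_move()/folio_batch_move_lru().
 */
#include <stdio.h>

#define BATCH_SIZE 15			/* invented stand-in for PAGEVEC_SIZE */

struct batch {
	unsigned int nr;
	int items[BATCH_SIZE];
};

typedef void (*move_fn_t)(int item);

static void batch_move(struct batch *b, move_fn_t move_fn)
{
	unsigned int i;

	/* One "LRU lock acquisition" would cover all staged items here. */
	for (i = 0; i < b->nr; i++)
		move_fn(b->items[i]);
	b->nr = 0;
}

static void batch_add_and_move(struct batch *b, int item, move_fn_t move_fn)
{
	b->items[b->nr++] = item;
	if (b->nr == BATCH_SIZE)	/* batch full: drain it now */
		batch_move(b, move_fn);
}

static void move_to_lru(int item)
{
	printf("moving item %d\n", item);
}

int main(void)
{
	struct batch b = { .nr = 0 };
	int i;

	for (i = 0; i < 40; i++)
		batch_add_and_move(&b, i, move_to_lru);

	batch_move(&b, move_to_lru);	/* final drain, like lru_add_drain() */
	return 0;
}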
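lru_note_cost() above also rewards a closer look: file_cost and anon_cost are kept as a decaying sum, halved whenever their combined total passes a quarter of the LRU size, so recent refaults weigh more than old ones. Here is a minimal user-space sketch of that decay; the constants stand in for SWAP_CLUSTER_MAX and the LRU size and are made up, and the sketch models only the arithmetic, not the locking or the memcg hierarchy walk.

/*
 * Stand-alone sketch (not kernel code) of the "floating average" cost
 * accounting in lru_note_cost(): costs accumulate per LRU side and both
 * counters are halved once their sum exceeds a quarter of the LRU size.
 */
#include <stdio.h>

#define CLUSTER 32			/* invented stand-in for SWAP_CLUSTER_MAX */
#define LRUSIZE (1UL << 20)		/* pretend LRU size, in pages */

static unsigned long file_cost, anon_cost;

static void note_cost(int file, unsigned long nr_io, unsigned long nr_rotated)
{
	unsigned long cost = nr_io * CLUSTER + nr_rotated;

	if (file)
		file_cost += cost;
	else
		anon_cost += cost;

	/* Decay previous events once the totals grow past lrusize/4. */
	if (file_cost + anon_cost > LRUSIZE / 4) {
		file_cost /= 2;
		anon_cost /= 2;
	}
}

int main(void)
{
	int i;

	/* A burst of file refaults followed by a burst of anon refaults:
	 * the older file cost decays while the recent anon cost dominates. */
	for (i = 0; i < 10000; i++)
		note_cost(1, 1, 0);
	for (i = 0; i < 10000; i++)
		note_cost(0, 1, 0);

	printf("file_cost=%lu anon_cost=%lu\n", file_cost, anon_cost);
	return 0;
}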
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>

struct ucounts init_ucounts = {
	.ns    = &init_user_ns,
	.uid   = GLOBAL_ROOT_UID,
	.count = ATOMIC_INIT(1),
};

#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);

#define ucounts_hashfn(ns, uid)						\
	hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
		  UCOUNTS_HASHTABLE_BITS)
#define ucounts_hashentry(ns, uid)	\
	(ucounts_hashtable + ucounts_hashfn(ns, uid))

#ifdef CONFIG_SYSCTL
static struct ctl_table_set *
set_lookup(struct ctl_table_root *root)
{
	return &current_user_ns()->set;
}

static int set_is_seen(struct ctl_table_set *set)
{
	return &current_user_ns()->set == set;
}

static int set_permissions(struct ctl_table_header *head,
			   const struct ctl_table *table)
{
	struct user_namespace *user_ns =
		container_of(head->set, struct user_namespace, set);
	int mode;

	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
	if (ns_capable(user_ns, CAP_SYS_RESOURCE))
		mode = (table->mode & S_IRWXU) >> 6;
	else
		/* Allow all others at most read-only access */
		mode = table->mode & S_IROTH;
	return (mode << 6) | (mode << 3) | mode;
}

static struct ctl_table_root set_root = {
	.lookup = set_lookup,
	.permissions = set_permissions,
};

static long ue_zero = 0;
static long ue_int_max = INT_MAX;

#define UCOUNT_ENTRY(name)					\
	{							\
		.procname = name,				\
		.maxlen = sizeof(long),				\
		.mode = 0644,					\
		.proc_handler = proc_doulongvec_minmax,		\
		.extra1 = &ue_zero,				\
		.extra2 = &ue_int_max,				\
	}
static const struct ctl_table user_table[] = {
	UCOUNT_ENTRY("max_user_namespaces"),
	UCOUNT_ENTRY("max_pid_namespaces"),
	UCOUNT_ENTRY("max_uts_namespaces"),
	UCOUNT_ENTRY("max_ipc_namespaces"),
	UCOUNT_ENTRY("max_net_namespaces"),
	UCOUNT_ENTRY("max_mnt_namespaces"),
	UCOUNT_ENTRY("max_cgroup_namespaces"),
	UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
	UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"), #endif #ifdef CONFIG_FANOTIFY UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif }; #endif /* CONFIG_SYSCTL */ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < UCOUNT_COUNTS; i++) { tbl[i].data = &ns->ucount_max[i]; } ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl, ARRAY_SIZE(user_table)); } if (!ns->sysctls) { kfree(tbl); retire_sysctl_set(&ns->set); return false; } #endif return true; } void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); retire_sysctl_set(&ns->set); kfree(tbl); #endif } static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) { struct ucounts *ucounts; hlist_for_each_entry(ucounts, hashent, node) { if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) return ucounts; } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); spin_lock_irq(&ucounts_lock); hlist_add_head(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) { /* Returns true on a successful get, false if the count wraps. */ return !atomic_add_negative(1, &ucounts->count); } struct ucounts *get_ucounts(struct ucounts *ucounts) { if (!get_ucounts_or_wrap(ucounts)) { put_ucounts(ucounts); ucounts = NULL; } return ucounts; } struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); bool wrapped; struct ucounts *ucounts, *new = NULL; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { spin_unlock_irq(&ucounts_lock); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; new->ns = ns; new->uid = uid; atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); kfree(new); if (wrapped) { put_ucounts(ucounts); return NULL; } return ucounts; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); put_user_ns(ucounts->ns); kfree(ucounts); } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { long c, old; c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; } } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type) { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) 
atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; } void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); } return ret; } bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; } return (new == 0); } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) put_ucounts(iter); } } void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto dec_unwind; if (iter == ucounts) ret = new; if (!override_rlimit) max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. */ if (new != 1) continue; if (!get_ucounts(iter)) goto dec_unwind; } return ret; dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; max = get_userns_rlimit_max(iter->ns, type); } return false; } static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL static struct ctl_table_header *user_header; static struct ctl_table empty[1]; /* * It is necessary to register the user directory in the * default set so that registrations in the child sets work * properly. */ user_header = register_sysctl_sz("user", empty, 0); kmemleak_ignore(user_header); BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init);
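inc_ucount() above walks the user-namespace chain and bumps each level's counter only while that level stays under its limit, via atomic_long_inc_below(). The snippet below is a stand-alone sketch of that compare-and-swap pattern using C11 atomics; inc_below() and the limit value are invented for the example and are not kernel APIs. A real caller such as inc_ucount() additionally has to unwind the increments already taken on outer levels when an inner level refuses.

/*
 * Stand-alone sketch (not kernel code) of a bounded increment: read the
 * counter, refuse if it is already at the limit, otherwise try to publish
 * old + 1 and retry if another thread raced us, mirroring the
 * atomic_long_cmpxchg() loop in atomic_long_inc_below() above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_below(atomic_long *v, long limit)
{
	long c = atomic_load(v);

	for (;;) {
		if (c >= limit)
			return false;	/* already at the limit */
		/* On failure, c is refreshed with the current value. */
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	}
}

int main(void)
{
	atomic_long count = 0;
	int granted = 0, i;

	for (i = 0; i < 10; i++)
		granted += inc_below(&count, 5);

	printf("granted %d of 10 (count=%ld)\n", granted, atomic_load(&count));
	return 0;
}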
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/printk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not. Added option to suppress kernel printk's
 * to the console. Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
* Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. 
ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). 
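
As a quick illustration of the parsing above, __control_devkmsg() accepts exactly three strings, which map to the mask bits as follows; when the value comes from the printk.devkmsg= boot parameter, control_devkmsg() additionally sets DEVKMSG_LOG_MASK_LOCK, so a later write through the printk_devkmsg sysctl handled by devkmsg_sysctl_set_loglvl() is rejected with -EINVAL:

	"on"        -> DEVKMSG_LOG_MASK_ON       (no ratelimiting of /dev/kmsg writes)
	"off"       -> DEVKMSG_LOG_MASK_OFF      (userspace writes are ignored)
	"ratelimit" -> DEVKMSG_LOG_MASK_DEFAULT  (default behaviour)
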
*/ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). 
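
The read side documented above is normally used in the following pattern; this is a minimal sketch assuming the for_each_console_srcu() iterator provided by the console headers:

	int cookie;
	struct console *con;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		/* read-only inspection of con under SRCU protection */
	}
	console_srcu_read_unlock(cookie);
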
* * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. 
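
Putting the pieces of the comment above together, the example record shown there (facility 0, level 3, seq 56, ts_nsec 36863, so 36 microseconds) would be exported through /dev/kmsg roughly as the text below; the dev_info key/value pairs follow as space-indented continuation lines, and with CONFIG_PRINTK_CALLER enabled a ",caller=T299" field would appear before the ';':

	3,56,36,-;it's a line
	 SUBSYSTEM=pci
	 DEVICE=+pci:0000:00:01.0
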
*/ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. 
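
To make the sizing above concrete: PRB_AVGBITS assumes a 32-byte average record, so the descriptor count is the text buffer size shifted right by 5. A small illustration, assuming a configuration with CONFIG_LOG_BUF_SHIFT=17 (a 128 KiB static text buffer); the dynamic buffer set up later sizes itself the same way with new_log_buf_len >> PRB_AVGBITS:

	/* Illustrative: CONFIG_LOG_BUF_SHIFT = 17 -> 128 KiB of text */
	descs = 1 << (17 - PRB_AVGBITS);	/* 4096 descriptors */
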
*/ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. 
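
For example, a message injected with the prefix "<14>" decodes as facility 1 (LOG_USER) and level 6 using the LOG_LEVEL()/LOG_FACILITY() macros defined earlier; a minimal sketch:

	/* Illustrative decode of a "<14>" prefix written to /dev/kmsg. */
	unsigned int v = 14;
	unsigned int level = LOG_LEVEL(v);		/* 14 & 0x07        -> 6 */
	unsigned int facility = LOG_FACILITY(v);	/* (14 >> 3) & 0xff -> 1 (LOG_USER) */
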
* * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
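
A minimal userspace sketch (not part of this file) that exercises devkmsg_write() above; the "<14>" prefix is parsed exactly as described, giving facility LOG_USER and level 6, while an unprefixed write is stored with facility LOG_USER and the default message loglevel:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char msg[] = "<14>hello from userspace\n";
		int fd = open("/dev/kmsg", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, msg, strlen(msg)) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}
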
*/ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. 
*/ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. 
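
As an illustration of log_buf_len_update() above: the requested size is parsed with memparse() (so K/M/G suffixes work), rounded up to a power of two, capped at LOG_BUF_LEN_MAX (2 GiB), and only takes effect if it is larger than the current log_buf_len. For example:

	/* Illustrative: "log_buf_len=5M" on the kernel command line */
	u64 size = 5ULL << 20;			/* memparse("5M", ...)    */
	size = roundup_pow_of_two(size);	/* -> 8 MiB (1 << 23)     */
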
*/ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. 
*/ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? 
preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). 
*/ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. 
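
A worked example of the text preparation above: a kernel record (facility 0) with level 6, a timestamp of 5.000123 s and text "first\nsecond" is rewritten in place, when both the syslog and time prefixes are requested, as

	<6>[    5.000123] first
	<6>[    5.000123] second

with a trailing newline added after the last line. The matching estimate from get_record_print_text_size() is prefix_len * line_count + text_len + 1 = 18 * 2 + 12 + 1 = 49 bytes, which is also the length returned by record_print_text() for this record.
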
*/ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. 
*/ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. 
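
A minimal userspace sketch (not part of this file) of driving do_syslog() through the syslog(2) wrapper; action 8 (SYSLOG_ACTION_CONSOLE_LEVEL) sets console_loglevel, clamps the value to at least minimum_console_loglevel, and is a restricted action, so CAP_SYSLOG is required:

	#include <stdio.h>
	#include <sys/klog.h>

	int main(void)
	{
		/* SYSLOG_ACTION_CONSOLE_LEVEL == 8: show everything up to debug. */
		if (klogctl(8, NULL, 8) < 0) {
			perror("klogctl");
			return 1;
		}
		return 0;
	}
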
* They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. 
It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). 
*/ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. 
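
For example, printk_parse_prefix() above turns the text produced by printk(KERN_ERR "disk error\n") into level 3 with a 2-byte prefix, while a KERN_CONT prefix only sets LOG_CONT in @flags; a sketch of the resulting values, assuming @level starts out as LOGLEVEL_DEFAULT:

	/* Illustrative: what vprintk_store() sees for printk(KERN_ERR "disk error\n") */
	int level = LOGLEVEL_DEFAULT;
	enum printk_info_flags flags = 0;
	u16 n = printk_parse_prefix("\001" "3" "disk error\n", &level, &flags);
	/* n == 2, level == 3, flags == 0; printk_sprint() later strips the
	 * prefix and the trailing newline and sets LOG_NEWLINE. */
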
An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). 
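
The LOG_CONT path above is what makes continuation lines work: a message stored without a trailing newline is committed but not finalized, and a later printk carrying the KERN_CONT prefix and the same caller id is appended to that still-open record via prb_reserve_in_last(). An illustrative caller-side sketch:

	/* Illustrative only: both lines end up in a single record. */
	printk(KERN_INFO "probing device...");	/* no '\n': record stays open */
	printk(KERN_CONT " done\n");		/* appended, then finalized   */
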
*/ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. 
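/*
 * Sketch: a caller that may hold scheduler locks uses the deferred
 * variant, which reaches vprintk_emit() above with LOGLEVEL_SCHED so
 * that console printing is offloaded rather than attempted here.
 */
static void sched_context_demo(void)
{
        printk_deferred(KERN_WARNING "demo: wakeup latency exceeded\n");
}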
*/ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. 
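/*
 * Sketch of the arch-default usage described for add_preferred_console()
 * above: supply a fallback console only when the user gave none on the
 * command line. The driver name, index and options are illustrative.
 */
static char demo_opts[] __initdata = "115200n8";

static void __init demo_arch_default_console(void)
{
        if (!console_set_on_cmdline)
                add_preferred_console("ttyS", 0, demo_opts);
}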
* It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * suspend_console - suspend the console subsystem * * This disables printk() while we go into suspend states */ void suspend_console(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void resume_console(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. 
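/*
 * Sketch of a subsystem caller resolving a "console=DEVNAME:0.0" entry
 * once the driver name and port index are known, as described above.
 * The device and index used here are illustrative only.
 */
static void demo_associate_console(struct device *dev, int line)
{
        match_devname_and_update_preferred_console(dev_name(dev),
                                                   "ttyS", line);
}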
*/ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. 
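/*
 * Sketch of the usual pattern around the console_lock()/console_unlock()
 * pair documented above: mutate console state while no write() callback
 * can run, then let console_unlock() flush anything printk() buffered.
 */
static void demo_reconfigure_console(void)
{
        console_lock();
        /* ... safely touch console/VT state here ... */
        console_unlock();       /* also prints any records buffered meanwhile */
}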
*/ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback when legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. 
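/*
 * Sketch of the consumer side (hypothetical driver): a legacy console's
 * ->write() callback receives exactly the bytes that
 * printk_get_next_message() above formatted into pmsg->pbufs->outbuf.
 */
static void demo_con_write(struct console *con, const char *s, unsigned int n)
{
        while (n--)
                demo_hw_put_char(*s++); /* hypothetical MMIO/port write */
}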
The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. 
*/ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. 
*/ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. 
*/ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can disable console output before suspending a port, and can * re-enable output afterwards. */ void console_stop(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. 
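/*
 * Sketch of the serial-suspend usage described in the console_stop() /
 * console_start() comments above. "demo_uart" and its embedded console
 * are illustrative only.
 */
struct demo_uart {
        struct console console;
        /* ... hardware state ... */
};

static void demo_uart_suspend(struct demo_uart *u)
{
        console_stop(&u->console);
        /* ... power down the port ... */
}

static void demo_uart_resume(struct demo_uart *u)
{
        /* ... power the port back up ... */
        console_start(&u->console);
}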
*/ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. 
*/ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. 
Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. 
*/ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). 
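/*
 * Sketch of a typical registration path feeding register_console()
 * above. The console structure, its (hypothetical) output routine and
 * the initcall are illustrative; real drivers register once their
 * hardware can emit bytes.
 */
static struct console demo_console = {
        .name   = "demo",
        .write  = demo_con_write,       /* hypothetical output routine */
        .flags  = CON_PRINTBUFFER,
        .index  = -1,
};

static int __init demo_console_register(void)
{
        register_console(&demo_console);
        return 0;
}
console_initcall(demo_console_register);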
*/ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. 
*/ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. 
*/ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ static bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. 
*/ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. 
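/*
 * Sketch of the caller-controlled rate limiting documented above: the
 * timestamp lives with the caller, so this site is limited independently
 * of printk_ratelimit() users. Interval and message are illustrative.
 */
static void demo_report_spurious_irq(void)
{
        static unsigned long demo_last_jiffies;

        if (printk_timed_ratelimit(&demo_last_jiffies, 5000))
                pr_warn("demo: spurious interrupt\n");
}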
* @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? 
*/ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. 
*/ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */
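The kmsg_dump interface above (kmsg_dump_register(), kmsg_dump_rewind(), kmsg_dump_get_line()) is meant to be driven from a dumper's dump() callback when the kernel oopses, panics or shuts down. Below is a minimal sketch of such a dumper built only from the functions defined above; the module name, the 256-byte line buffer and the "write to storage" step are illustrative assumptions, not part of printk.c.

/*
 * Illustrative sketch of a kmsg dumper module (not part of printk.c).
 * It walks the log oldest-first from its dump() callback.
 */
#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void example_dump(struct kmsg_dumper *dumper,
                         struct kmsg_dump_detail *detail)
{
        struct kmsg_dump_iter iter;
        char line[256];
        size_t len;

        /* Start at the oldest record that has not been cleared. */
        kmsg_dump_rewind(&iter);

        /* Copy one record at a time until kmsg_dump_get_line() returns false. */
        while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len))
                ; /* e.g. push 'len' bytes of 'line' to persistent storage */
}

static struct kmsg_dumper example_dumper = {
        .dump           = example_dump,
        .max_reason     = KMSG_DUMP_PANIC,
};

static int __init example_dumper_init(void)
{
        return kmsg_dump_register(&example_dumper);
}

static void __exit example_dumper_exit(void)
{
        kmsg_dump_unregister(&example_dumper);
}

module_init(example_dumper_init);
module_exit(example_dumper_exit);
MODULE_LICENSE("GPL");

A backend that prefers one large contiguous copy of the youngest records could call kmsg_dump_get_buffer() instead of looping over kmsg_dump_get_line().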
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015 Linaro Ltd. * Author: Shannon Zhao <shannon.zhao@linaro.org> */ #ifndef __ASM_ARM_KVM_PMU_H #define __ASM_ARM_KVM_PMU_H #include <linux/perf_event.h> #include <linux/perf/arm_pmuv3.h> #define KVM_ARMV8_PMU_MAX_COUNTERS 32 #if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM) struct kvm_pmc { u8 idx; /* index into the pmu->pmc array */ struct perf_event *perf_event; }; struct kvm_pmu_events { u64 events_host; u64 events_guest; }; struct kvm_pmu { struct irq_work overflow_work; struct kvm_pmu_events events; struct kvm_pmc pmc[KVM_ARMV8_PMU_MAX_COUNTERS]; int irq_num; bool created; bool irq_level; }; struct arm_pmu_entry { struct list_head entry; struct arm_pmu *arm_pmu; }; DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); static __always_inline bool kvm_arm_support_pmu_v3(void) { return static_branch_likely(&kvm_arm_pmu_available); } #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx); void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu); int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); struct kvm_pmu_events *kvm_get_pmu_events(void); void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); #define kvm_vcpu_has_pmu(vcpu) \ (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3)) /* * Updates the vcpu's view of the pmu events for this cpu. * Must be called before every vcpu run after disabling interrupts, to ensure * that an interrupt cannot fire and update the structure.

*/ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ if (!has_vhe() && kvm_arm_support_pmu_v3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) u8 kvm_arm_pmu_get_pmuver_limit(void); u64 kvm_pmu_evtyper_mask(struct kvm *kvm); int kvm_arm_set_default_pmu(struct kvm *kvm); u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx); void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; static inline bool kvm_arm_support_pmu_v3(void) { return false; } #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { return 0; } static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { return 0; } static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) { return 0; } static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { return false; } static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) {} static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { return 0; } static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { return 0; } #define kvm_vcpu_has_pmu(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) {} static inline u8 kvm_arm_pmu_get_pmuver_limit(void) { return 0; } static inline u64 kvm_pmu_evtyper_mask(struct kvm *kvm) { return 0; } static inline void kvm_vcpu_pmu_resync_el0(void) {} static inline int kvm_arm_set_default_pmu(struct kvm *kvm) { return -ENODEV; } static inline u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { return 0; } static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { return 0; } static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) { return false; } static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {} #endif #endif
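The header pairs every helper with an empty stub for configurations without PMU support, so common code can call them unconditionally. The sketch below is hypothetical (the function name and the local_irq_save() framing are not from KVM); it only illustrates the contract stated above, namely that kvm_pmu_update_vcpu_events() must run with interrupts disabled so the per-cpu kvm_pmu_events snapshot cannot change before the guest is entered.

/* Hypothetical caller, sketching use of the helpers declared above. */
static void example_prepare_pmu_for_run(struct kvm_vcpu *vcpu)
{
        unsigned long flags;

        /* Constant 'false' in the stubbed configuration. */
        if (!kvm_vcpu_has_pmu(vcpu))
                return;

        /* Fold emulated counter overflow into the virtual interrupt line. */
        kvm_pmu_flush_hwstate(vcpu);

        local_irq_save(flags);

        /* No-op on VHE systems or when kvm_arm_support_pmu_v3() is false. */
        kvm_pmu_update_vcpu_events(vcpu);

        /* ... the guest would be entered here, still with IRQs disabled ... */

        local_irq_restore(flags);
}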
/* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage.
* * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. 
*/ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, /* non-lru isolated movable page */ PG_isolated = PG_reclaim, /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } #endif static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. 
* @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) nth_page(&(folio)->page, n) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. 
*/ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static 
__always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, 
reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. 
*/ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable * page and then folio->mapping points to a struct movable_operations. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) /* * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline bool PageMappingFlags(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } static __always_inline bool __folio_test_movable(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; } static __always_inline bool __PageMovable(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. 
* * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. 
*/ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for * normal or transparent huge pages. * * PageTransHuge() returns true for both transparent huge and * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. */ static inline int PageTransHuge(const struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); } /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. 
*/ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_mapcount_is_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. 
PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) FOLIO_TYPE_OPS(slab, slab) /** * PageSlab - Determine if the page belongs to the slab allocator * @page: The page to test. * * Context: Any context. * Return: True for slab pages, false for any other kind of page. */ static inline bool PageSlab(const struct page *page) { return folio_test_slab(page_folio(page)); } #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } bool is_free_buddy_page(const struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. 
If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
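The PAGE_TYPE_OPS()/FOLIO_TYPE_OPS() helpers above all follow the same encoding: the top byte of page->page_type carries the PGTY_* value, and UINT_MAX means no type is set. Below is a minimal userspace sketch of that encoding, assuming a stand-in demo_page struct; it is illustrative only and not part of the kernel API.

#include <limits.h>
#include <stdio.h>

#define DEMO_PGTY_buddy 0xf0	/* value taken from the enum above */

struct demo_page { unsigned int page_type; };	/* stand-in for struct page */

static int demo_PageBuddy(const struct demo_page *p)
{
	return (p->page_type >> 24) == DEMO_PGTY_buddy;
}

static void demo_SetPageBuddy(struct demo_page *p)
{
	/* only an untyped page (page_type == UINT_MAX) may gain a type */
	p->page_type = (unsigned int)DEMO_PGTY_buddy << 24;
}

static void demo_ClearPageBuddy(struct demo_page *p)
{
	p->page_type = UINT_MAX;	/* back to "no type set" */
}

int main(void)
{
	struct demo_page p = { .page_type = UINT_MAX };

	printf("%d\n", demo_PageBuddy(&p));	/* 0: untyped */
	demo_SetPageBuddy(&p);
	printf("%d\n", demo_PageBuddy(&p));	/* 1: typed as buddy */
	demo_ClearPageBuddy(&p);
	printf("%d\n", demo_PageBuddy(&p));	/* 0 again */
	return 0;
}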
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. 
* * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. 
*/ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. 
*/ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. 
*/ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. 
*/ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_highmem(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. 
* @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */
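As a usage note for the kmap_local_page() stacking rule documented above, here is a small sketch showing the required reverse-order unmap; copy_one_page() is a hypothetical caller, not part of this header, and it mirrors what memcpy_page() and copy_highpage() do internally.

static inline void copy_one_page(struct page *dst, struct page *src)
{
	char *d = kmap_local_page(dst);
	char *s = kmap_local_page(src);

	memcpy(d, s, PAGE_SIZE);

	/* strictly nested: the last mapping taken is released first */
	kunmap_local(s);
	kunmap_local(d);
}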
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2020 - Google LLC * Author: Quentin Perret <qperret@google.com> */ #include <linux/init.h> #include <linux/kmemleak.h> #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> #include <linux/sort.h> #include <asm/kvm_pkvm.h> #include "hyp_constants.h" DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); phys_addr_t hyp_mem_base; phys_addr_t hyp_mem_size; static int cmp_hyp_memblock(const void *p1, const void *p2) { const struct memblock_region *r1 = p1; const struct memblock_region *r2 = p2; return r1->base < r2->base ? 
-1 : (r1->base > r2->base); } static void __init sort_memblock_regions(void) { sort(hyp_memory, *hyp_memblock_nr_ptr, sizeof(struct memblock_region), cmp_hyp_memblock, NULL); } static int __init register_memblock_regions(void) { struct memblock_region *reg; for_each_mem_region(reg) { if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) return -ENOMEM; hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; } sort_memblock_regions(); return 0; } void __init kvm_hyp_reserve(void) { u64 hyp_mem_pages = 0; int ret; if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) return; if (kvm_get_mode() != KVM_MODE_PROTECTED) return; ret = register_memblock_regions(); if (ret) { *hyp_memblock_nr_ptr = 0; kvm_err("Failed to register hyp memblocks: %d\n", ret); return; } hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); hyp_mem_pages += hyp_ffa_proxy_pages(); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. */ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), PMD_SIZE); if (!hyp_mem_base) hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); if (!hyp_mem_base) { kvm_err("Failed to reserve hyp memory\n"); return; } kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); } static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) { if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, host_kvm->arch.pkvm.handle)); } host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); } /* * Allocates and donates memory for hypervisor VM structs at EL2. * * Allocates space for the VM state, which includes the hyp vm as well as * the hyp vcpus. * * Stores an opaque handler in the kvm struct for future reference. * * Return 0 on success, negative error code on failure. */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; struct kvm_vcpu *host_vcpu; pkvm_handle_t handle; void *pgd, *hyp_vm; unsigned long idx; int ret; if (host_kvm->created_vcpus < 1) return -EINVAL; pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr); /* * The PGD pages will be reclaimed using a hyp_memcache which implies * page granularity. So, use alloc_pages_exact() to get individual * refcounts. */ pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); if (!pgd) return -ENOMEM; /* Allocate memory to donate to hyp for vm and vcpu pointers. */ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, size_mul(sizeof(void *), host_kvm->created_vcpus))); hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vm) { ret = -ENOMEM; goto free_pgd; } /* Donate the VM memory to hyp and let hyp initialize it. */ ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); if (ret < 0) goto free_vm; handle = ret; host_kvm->arch.pkvm.handle = handle; /* Donate memory for the vcpus at hyp and initialize it. */ hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { void *hyp_vcpu; /* Indexing of the vcpus to be sequential starting at 0. 
*/ if (WARN_ON(host_vcpu->vcpu_idx != idx)) { ret = -EINVAL; goto destroy_vm; } hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vcpu) { ret = -ENOMEM; goto destroy_vm; } ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, hyp_vcpu); if (ret) { free_pages_exact(hyp_vcpu, hyp_vcpu_sz); goto destroy_vm; } } return 0; destroy_vm: __pkvm_destroy_hyp_vm(host_kvm); return ret; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); free_pgd: free_pages_exact(pgd, pgd_sz); return ret; } int pkvm_create_hyp_vm(struct kvm *host_kvm) { int ret = 0; mutex_lock(&host_kvm->arch.config_lock); if (!host_kvm->arch.pkvm.handle) ret = __pkvm_create_hyp_vm(host_kvm); mutex_unlock(&host_kvm->arch.config_lock); return ret; } void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { mutex_lock(&host_kvm->arch.config_lock); __pkvm_destroy_hyp_vm(host_kvm); mutex_unlock(&host_kvm->arch.config_lock); } int pkvm_init_host_vm(struct kvm *host_kvm) { return 0; } static void __init _kvm_host_prot_finalize(void *arg) { int *err = arg; if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) WRITE_ONCE(*err, -EINVAL); } static int __init pkvm_drop_host_privileges(void) { int ret = 0; /* * Flip the static key upfront as that may no longer be possible * once the host stage 2 is installed. */ static_branch_enable(&kvm_protected_mode_initialized); on_each_cpu(_kvm_host_prot_finalize, &ret, 1); return ret; } static int __init finalize_pkvm(void) { int ret; if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised()) return 0; /* * Exclude HYP sections from kmemleak so that they don't get peeked * at, which would end badly once inaccessible. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); ret = pkvm_drop_host_privileges(); if (ret) pr_err("Failed to finalize Hyp protection: %d\n", ret); return ret; } device_initcall_sync(finalize_pkvm); static int cmp_mappings(struct rb_node *node, const struct rb_node *parent) { struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node); struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node); if (a->gfn < b->gfn) return -1; if (a->gfn > b->gfn) return 1; return 0; } static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn) { struct rb_node *node = root->rb_node, *prev = NULL; struct pkvm_mapping *mapping; while (node) { mapping = rb_entry(node, struct pkvm_mapping, node); if (mapping->gfn == gfn) return node; prev = node; node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right; } return prev; } /* * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing * of __map inline. 
*/ #define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \ ((__start) >> PAGE_SHIFT)); \ __tmp && ({ \ __map = rb_entry(__tmp, struct pkvm_mapping, node); \ __tmp = rb_next(__tmp); \ true; \ }); \ ) \ if (__map->gfn < ((__start) >> PAGE_SHIFT)) \ continue; \ else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \ break; \ else int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { pgt->pkvm_mappings = RB_ROOT; pgt->mmu = mmu; return 0; } void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; struct rb_node *node; if (!handle) return; node = rb_first(&pgt->pkvm_mappings); while (node) { mapping = rb_entry(node, struct pkvm_mapping, node); kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); node = rb_next(node); rb_erase(&mapping->node, &pgt->pkvm_mappings); kfree(mapping); } } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); struct pkvm_mapping *mapping = NULL; struct kvm_hyp_memcache *cache = mc; u64 gfn = addr >> PAGE_SHIFT; u64 pfn = phys >> PAGE_SHIFT; int ret; if (size != PAGE_SIZE) return -EINVAL; lockdep_assert_held_write(&kvm->mmu_lock); ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot); if (ret) { /* Is the gfn already mapped due to a racing vCPU? */ if (ret == -EPERM) return -EAGAIN; } swap(mapping, cache->mapping); mapping->gfn = gfn; mapping->pfn = pfn; WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings)); return ret; } int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; int ret = 0; lockdep_assert_held_write(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); if (WARN_ON(ret)) break; rb_erase(&mapping->node, &pgt->pkvm_mappings); kfree(mapping); } return ret; } int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; int ret = 0; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn); if (WARN_ON(ret)) break; } return ret; } int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); struct pkvm_mapping *mapping; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE); return 0; } bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; bool young = false; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, mkold); return 
young; } int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); } void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); } void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { WARN_ON_ONCE(1); } kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { WARN_ON_ONCE(1); return NULL; } int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc) { WARN_ON_ONCE(1); return -EINVAL; }
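The pkvm stage-2 helpers above track guest mappings in an rb-tree sorted by gfn (see cmp_mappings()) and walk a [start, end) gfn window with for_each_mapping_in_range_safe(). The following userspace sketch shows only that ordering and range filtering, with a plain sorted array standing in for the rb-tree purely for illustration.

#include <stdio.h>

struct demo_mapping { unsigned long gfn; };

int main(void)
{
	/* kept sorted by gfn, as cmp_mappings() enforces for the rb-tree */
	struct demo_mapping maps[] = { { 4 }, { 5 }, { 7 }, { 9 } };
	unsigned long start_gfn = 5, end_gfn = 9;	/* window [5, 9) */
	unsigned int i;

	for (i = 0; i < sizeof(maps) / sizeof(maps[0]); i++) {
		if (maps[i].gfn < start_gfn)
			continue;	/* below the window: skip */
		if (maps[i].gfn >= end_gfn)
			break;		/* past the window: stop */
		printf("visit gfn %lu\n", maps[i].gfn);
	}
	return 0;	/* visits gfns 5 and 7 */
}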
// SPDX-License-Identifier: GPL-2.0 /* * Hyp portion of the (not much of an) Emulation layer for 32bit guests. * * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * based on arch/arm/kvm/emulate.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> /* * stolen from arch/arm/kernel/opcodes.c * * condition code lookup table * index into the table is test code: EQ, NE, ... LT, GT, AL, NV * * bit position in short is condition code: NZCV */ static const unsigned short cc_map[16] = { 0xF0F0, /* EQ == Z set */ 0x0F0F, /* NE */ 0xCCCC, /* CS == C set */ 0x3333, /* CC */ 0xFF00, /* MI == N set */ 0x00FF, /* PL */ 0xAAAA, /* VS == V set */ 0x5555, /* VC */ 0x0C0C, /* HI == C set && Z clear */ 0xF3F3, /* LS == C clear || Z set */ 0xAA55, /* GE == (N==V) */ 0x55AA, /* LT == (N!=V) */ 0x0A05, /* GT == (!Z && (N==V)) */ 0xF5FA, /* LE == (Z || (N!=V)) */ 0xFFFF, /* AL always */ 0 /* NV */ }; /* * Check if a trapped instruction should have been executed or not. */ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) { unsigned long cpsr; u32 cpsr_cond; int cond; /* * These are the exception classes that could fire with a * conditional instruction. */ switch (kvm_vcpu_trap_get_class(vcpu)) { case ESR_ELx_EC_CP15_32: case ESR_ELx_EC_CP15_64: case ESR_ELx_EC_CP14_MR: case ESR_ELx_EC_CP14_LS: case ESR_ELx_EC_FP_ASIMD: case ESR_ELx_EC_CP10_ID: case ESR_ELx_EC_CP14_64: case ESR_ELx_EC_SVC32: break; default: return true; } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); if (cond == 0xE) return true; cpsr = *vcpu_cpsr(vcpu); if (cond < 0) { /* This can happen in Thumb mode: examine IT state. */ unsigned long it; it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); /* it == 0 => unconditional. */ if (it == 0) return true; /* The cond for this insn works out as the top 4 bits. */ cond = (it >> 4); } cpsr_cond = cpsr >> 28; if (!((cc_map[cond] >> cpsr_cond) & 1)) return false; return true; } /** * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have * to do this little bit of work manually. 
The fields map like this: * * IT[7:0] -> CPSR[26:25],CPSR[15:10] */ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) { unsigned long itbits, cond; unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_arm = !(cpsr & PSR_AA32_T_BIT); if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) return; cond = (cpsr & 0xe000) >> 13; itbits = (cpsr & 0x1c00) >> (10 - 2); itbits |= (cpsr & (0x3 << 25)) >> 25; /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ if ((itbits & 0x7) == 0) itbits = cond = 0; else itbits = (itbits << 1) & 0x1f; cpsr &= ~PSR_AA32_IT_MASK; cpsr |= cond << 13; cpsr |= (itbits & 0x1c) << (10 - 2); cpsr |= (itbits & 0x3) << 25; *vcpu_cpsr(vcpu) = cpsr; } /** * kvm_skip_instr32 - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ void kvm_skip_instr32(struct kvm_vcpu *vcpu) { u32 pc = *vcpu_pc(vcpu); bool is_thumb; is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu)) pc += 2; else pc += 4; *vcpu_pc(vcpu) = pc; kvm_adjust_itstate(vcpu); }
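The condition check in kvm_condition_valid32() boils down to indexing cc_map[] with the condition code and testing the bit selected by the guest's NZCV nibble (CPSR[31:28]). A worked userspace sketch of that lookup follows; the table values are copied from cc_map[] above, and nothing here is kernel API.

#include <stdio.h>

static const unsigned short demo_cc_map[16] = {
	0xF0F0, 0x0F0F, 0xCCCC, 0x3333, 0xFF00, 0x00FF, 0xAAAA, 0x5555,
	0x0C0C, 0xF3F3, 0xAA55, 0x55AA, 0x0A05, 0xF5FA, 0xFFFF, 0x0000,
};

int main(void)
{
	unsigned int cond = 0x0;	/* EQ: should pass only when Z is set */
	unsigned int nzcv = 0x4;	/* N=0 Z=1 C=0 V=0 */

	printf("EQ, Z set:   %d\n", (demo_cc_map[cond] >> nzcv) & 1);	/* 1 */
	nzcv = 0x0;			/* all flags clear */
	printf("EQ, Z clear: %d\n", (demo_cc_map[cond] >> nzcv) & 1);	/* 0 */
	return 0;
}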
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. 
*/ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. 
*/ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. This mode can only be entered and left under the protection of * the page table locks for all page tables which may be modified. In the UP * case, this is required so that preemption is disabled, and in the SMP case, * it must synchronize the delayed page table writes properly on other CPUs. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1. */ static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { return 1; } #endif #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. 
*/ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. 
*/ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. 
*Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. 
The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update the TLB, the local TLB can be updated here to avoid the * next page fault. This function updates the TLB only; it does nothing with the * cache or anything else, which is what distinguishes it from update_mmu_cache(). */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTEs which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD.
*/ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * On some architectures the hardware does not set the page access bit when a * memory page is accessed; it is the responsibility of software to set this bit, * which adds an extra page fault penalty just to track page accesses. As an * optimization, the page access bit can be set during the whole page fault flow * on these arches. To differentiate it from the pte_mkyoung macro, this macro is * used on platforms where software maintains the page access bit.
*/ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware updates of the access and dirty bits are not lost. * * Doing so allows certain architectures to avoid a TLB flush in most * cases. Still, another TLB flush might be necessary later if the PMD update * itself requires such a flush (e.g., if protection was made stricter). And * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so that fewer TLB flush operations are * needed.
*/ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. 
*/ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. 
*/ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. 
*/ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * the memory type of pfn mappings specified by remap_pfn_range() and * vmf_insert_pfn(). */ /* * track_pfn_remap is called when a _new_ pfn mapping is being established * by remap_pfn_range() for physical range indicated by pfn and size. */ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { return 0; } /* * track_pfn_insert is called when a _new_ single pfn is established * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { } /* * untrack_pfn_clear is called when mremapping a pfnmap to a new region, * or when the page table copy fails while duplicating a vm area.
*/ static inline void untrack_pfn_clear(struct vm_area_struct *vma) { } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); extern void untrack_pfn_clear(struct vm_area_struct *vma); #endif #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. 
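 *
 * When NUMA balancing is enabled, a fault handler typically combines both
 * checks; a minimal illustrative snippet (hypothetical, not part of this
 * header), assuming the handler has already looked up the PTE and its VMA:
 *
 *	if (vma_is_accessible(vma) && pte_protnone(pte))
 *		handle_numa_hinting_fault(vma, addr);	(hypothetical helper)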
*/ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. 
* * These are used by the p?d_alloc_track*() set of functions and in the generic * vmalloc/ioremap code to track at which page-table levels entries have been * modified. Based on that the code can better decide when vmalloc and ioremap * mapping changes need to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(), * pXd_devmap(), or hugetlb mappings).
*/ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * go severely wrong when it hits a real pud leaf. It is the arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(unsigned long vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */
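/*
 * Illustrative sketch (not part of the original header): the canonical way the
 * helpers above are combined to walk a range of page tables, mirroring the
 * generic walkers in mm/ (one small function per level, using p?d_addr_end()
 * to find the next boundary and p?d_none_or_clear_bad() to skip empty or
 * corrupt entries). All example_* names are hypothetical; the sketch assumes
 * <linux/pgtable.h> is included and that the caller prevents the tables from
 * being freed underneath it.
 */
static inline void example_walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* a real walker would map and visit the PTEs of this PMD here */
	} while (pmd++, addr = next, addr != end);
}

static inline void example_walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
{
	pud_t *pud = pud_offset(p4d, addr);
	unsigned long next;

	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		example_walk_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static inline void example_walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	unsigned long next;

	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		example_walk_pud_range(p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}

static inline void example_walk_kernel_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd = pgd_offset_k(addr);	/* top level of the kernel page tables */
	unsigned long next;

	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		example_walk_p4d_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}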
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIGNAL_H #define _LINUX_SIGNAL_H #include <linux/bug.h> #include <linux/list.h> #include <linux/signal_types.h> #include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; static inline void copy_siginfo(kernel_siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*to)); } static inline void clear_siginfo(kernel_siginfo_t *info) { memset(info, 0, sizeof(*info)); } #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) static inline void copy_siginfo_to_external(siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*from)); memset(((char *)to) + sizeof(struct kernel_siginfo), 0, SI_EXPANSION_SIZE); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); enum siginfo_layout { SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT, SIL_FAULT_TRAPNO, SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, SIL_FAULT_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, }; enum siginfo_layout siginfo_layout(unsigned sig, int si_code); /* * Define some primitives to manipulate sigset_t. */ #ifndef __HAVE_ARCH_SIG_BITOPS #include <linux/bitops.h> /* We don't use <linux/bitops.h> for these because there is no need to be atomic.
*/ static inline void sigaddset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] |= 1UL << sig; else set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void sigdelset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] &= ~(1UL << sig); else set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } static inline int sigismember(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) return 1 & (set->sig[0] >> sig); else return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) { switch (_NSIG_WORDS) { case 4: return (set->sig[3] | set->sig[2] | set->sig[1] | set->sig[0]) == 0; case 2: return (set->sig[1] | set->sig[0]) == 0; case 1: return set->sig[0] == 0; default: BUILD_BUG(); return 0; } } static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) { switch (_NSIG_WORDS) { case 4: return (set1->sig[3] == set2->sig[3]) && (set1->sig[2] == set2->sig[2]) && (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 2: return (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 1: return set1->sig[0] == set2->sig[0]; } return 0; } #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ { \ unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \ \ switch (_NSIG_WORDS) { \ case 4: \ a3 = a->sig[3]; a2 = a->sig[2]; \ b3 = b->sig[3]; b2 = b->sig[2]; \ r->sig[3] = op(a3, b3); \ r->sig[2] = op(a2, b2); \ fallthrough; \ case 2: \ a1 = a->sig[1]; b1 = b->sig[1]; \ r->sig[1] = op(a1, b1); \ fallthrough; \ case 1: \ a0 = a->sig[0]; b0 = b->sig[0]; \ r->sig[0] = op(a0, b0); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_or(x,y) ((x) | (y)) _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) #define _sig_andn(x,y) ((x) & ~(y)) _SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and #undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ { \ switch (_NSIG_WORDS) { \ case 4: set->sig[3] = op(set->sig[3]); \ set->sig[2] = op(set->sig[2]); \ fallthrough; \ case 2: set->sig[1] = op(set->sig[1]); \ fallthrough; \ case 1: set->sig[0] = op(set->sig[0]); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_not(x) (~(x)) _SIG_SET_OP(signotset, _sig_not) #undef _SIG_SET_OP #undef _sig_not static inline void sigemptyset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, 0, sizeof(sigset_t)); break; case 2: set->sig[1] = 0; fallthrough; case 1: set->sig[0] = 0; break; } } static inline void sigfillset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, -1, sizeof(sigset_t)); break; case 2: set->sig[1] = -1; fallthrough; case 1: set->sig[0] = -1; break; } } /* Some extensions for manipulating the low 32 signals in particular. 
*/ static inline void sigaddsetmask(sigset_t *set, unsigned long mask) { set->sig[0] |= mask; } static inline void sigdelsetmask(sigset_t *set, unsigned long mask) { set->sig[0] &= ~mask; } static inline int sigtestsetmask(sigset_t *set, unsigned long mask) { return (set->sig[0] & mask) != 0; } static inline void siginitset(sigset_t *set, unsigned long mask) { set->sig[0] = mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; break; case 1: ; } } static inline void siginitsetinv(sigset_t *set, unsigned long mask) { set->sig[0] = ~mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; break; case 1: ; } } #endif /* __HAVE_ARCH_SIG_SETOPS */ static inline void init_sigpending(struct sigpending *sig) { sigemptyset(&sig->signal); INIT_LIST_HEAD(&sig->list); } extern void flush_sigqueue(struct sigpending *queue); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) { return sig <= _NSIG ? 1 : 0; } struct timespec; struct pt_regs; enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); #define SIG_KTHREAD ((__force __sighandler_t)2) #define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) static inline void allow_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD); } static inline void allow_kernel_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know signals sent by the kernel will be handled, so that they * don't get silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) { kernel_sigaction(sig, SIG_IGN); } extern struct kmem_cache *sighand_cachep; extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal * is sent determines whether it's to one thread or the whole group, * which determines which signal mask(s) are involved in blocking it * from being delivered until later. When the signal is delivered, * either it's caught or ignored by a user handler or it has a default * effect that applies to the whole thread group (POSIX process). * * The possible effects an unblocked signal set to SIG_DFL can have are: * ignore - Nothing Happens * terminate - kill the process, i.e. all threads in the group, * similar to exit_group. The group leader (only) reports * WIFSIGNALED status to its parent. 
* coredump - write a core dump file describing all threads using * the same mm and then kill all those threads * stop - stop all the threads in the group, i.e. TASK_STOPPED state * * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. * Other signals when not blocked and set to SIG_DFL behave as follows. * The job control signals also have other special effects. * * +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ * * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". * (*) Special job control effects: * When SIGCONT is sent, it resumes the process (all threads in the group) * from TASK_STOPPED state and also clears any pending/queued stop signals * (any of those marked with "stop(*)"). This happens regardless of blocking, * catching, or ignoring SIGCONT. When any stop signal is sent, it clears * any pending/queued SIGCONT signals; this happens regardless of blocking, * catching, or ignoring the stop signal, though (except for SIGSTOP) the * default action of stopping the process may happen later or never.
*/ #ifdef SIGEMT #define SIGEMT_MASK rt_sigmask(SIGEMT) #else #define SIGEMT_MASK 0 #endif #if SIGRTMIN > BITS_PER_LONG #define rt_sigmask(sig) (1ULL << ((sig)-1)) #else #define rt_sigmask(sig) sigmask(sig) #endif #define siginmask(sig, mask) \ ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) #define SIG_KERNEL_STOP_MASK (\ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) #define SIG_KERNEL_COREDUMP_MASK (\ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ SIGEMT_MASK ) #define SIG_KERNEL_IGNORE_MASK (\ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) #define SIG_SPECIFIC_SICODES_MASK (\ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ SIGEMT_MASK ) #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); #define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); #ifdef CONFIG_DYNAMIC_SIGFRAME bool sigaltstack_size_valid(size_t ss_size); #else static inline bool sigaltstack_size_valid(size_t size) { return true; } #endif /* !CONFIG_DYNAMIC_SIGFRAME */ #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif #ifndef arch_untagged_si_addr /* * Given a fault address and a signal and si_code which correspond to the * _sigfault union member, returns the address that must appear in si_addr if * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. */ static inline void __user *arch_untagged_si_addr(void __user *addr, unsigned long sig, unsigned long si_code) { return addr; } #endif #endif /* _LINUX_SIGNAL_H */
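/*
 * Illustrative sketch (not part of the original header): typical use of the
 * sigset helpers defined above to build a mask and test membership. The
 * function name example_is_in_term_mask() is hypothetical; the sketch assumes
 * <linux/signal.h> is included so that siginitset(), sigaddset(), sigmask()
 * and sigismember() are available with the semantics shown earlier.
 */
static inline int example_is_in_term_mask(int sig)
{
	sigset_t mask;

	/* initialise word 0 from a bitmask of low signals, zero the rest */
	siginitset(&mask, sigmask(SIGINT) | sigmask(SIGTERM));
	/* add one more signal bit by number */
	sigaddset(&mask, SIGHUP);
	/* non-zero if @sig is one of SIGINT, SIGTERM or SIGHUP */
	return sigismember(&mask, sig);
}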
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/mmu_context.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_MMU_CONTEXT_H #define __ASM_MMU_CONTEXT_H #ifndef __ASSEMBLY__ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/sched/hotplug.h> #include <linux/mm_types.h> #include <linux/pgtable.h> #include <linux/pkeys.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/gcs.h> #include <asm/proc-fns.h> #include <asm/cputype.h> #include <asm/sysreg.h> #include <asm/tlbflush.h> extern bool rodata_full; static inline void contextidr_thread_switch(struct task_struct *next) { if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) return; write_sysreg(task_pid_nr(next), contextidr_el1); isb(); } /* * Set TTBR0 to reserved_pg_dir. No translations will be possible via TTBR0. */ static inline void cpu_set_reserved_ttbr0_nosync(void) { unsigned long ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); write_sysreg(ttbr, ttbr0_el1); } static inline void cpu_set_reserved_ttbr0(void) { cpu_set_reserved_ttbr0_nosync(); isb(); } void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm); static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) { BUG_ON(pgd == swapper_pg_dir); cpu_do_switch_mm(virt_to_phys(pgd),mm); } /* * TCR.T0SZ value to use when the ID map is active. */ #define idmap_t0sz TCR_T0SZ(IDMAP_VA_BITS) /* * Ensure TCR.T0SZ is set to the provided value. */ static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr = read_sysreg(tcr_el1); if ((tcr & TCR_T0SZ_MASK) == t0sz) return; tcr &= ~TCR_T0SZ_MASK; tcr |= t0sz; write_sysreg(tcr, tcr_el1); isb(); } #define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual)) #define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. * * The idmap lives in the same VA range as userspace, but uses global entries * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from * speculative TLB fetches, we must temporarily install the reserved page * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ.
* * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, * which should not be installed in TTBR0_EL1. In this case we can leave the * reserved page tables in place. */ static inline void cpu_uninstall_idmap(void) { struct mm_struct *mm = current->active_mm; cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } static inline void cpu_install_idmap(void) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_idmap_tcr_t0sz(); cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); } /* * Load our new page tables. A strict BBM approach requires that we ensure that * TLBs are free of any entries that may overlap with the global mappings we are * about to install. * * For a real hibernate/resume/kexec cycle TTBR0 currently points to a zero * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI runtime * services), while for a userspace-driven test_resume cycle it points to * userspace page tables (and we must point it at a zero page ourselves). * * We change T0SZ as part of installing the idmap. This is undone by * cpu_uninstall_idmap() in __cpu_suspend_exit(). */ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); __cpu_set_tcr_t0sz(t0sz); /* avoid cpu_switch_mm() and its SW-PAN and CNP interactions */ write_sysreg(ttbr0, ttbr0_el1); isb(); } void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp); static inline void cpu_enable_swapper_cnp(void) { __cpu_replace_ttbr1(lm_alias(swapper_pg_dir), true); } static inline void cpu_replace_ttbr1(pgd_t *pgdp) { /* * Only for early TTBR1 replacement before cpucaps are finalized and * before we've decided whether to use CNP. */ WARN_ON(system_capabilities_finalized()); __cpu_replace_ttbr1(pgdp, false); } /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously * free an ASID allocated in a future generation. We could workaround this by * freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap), * but we'd then need to make sure that we didn't dirty any TLBs afterwards. * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you * take CPU migration into account. */ void check_and_switch_context(struct mm_struct *mm); #define init_new_context(tsk, mm) init_new_context(tsk, mm) static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); /* pkey 0 is the default, so always reserve it. 
*/ mm->context.pkey_allocation_map = BIT(0); return 0; } static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) { } static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) { u64 ttbr; if (!system_uses_ttbr0_pan()) return; if (mm == &init_mm) ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); else ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48; WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } #else static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) { } #endif #define enter_lazy_tlb enter_lazy_tlb static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { /* * We don't actually care about the ttbr0 mapping, so point it at the * zero page. */ update_saved_ttbr0(tsk, &init_mm); } static inline void __switch_mm(struct mm_struct *next) { /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. */ if (next == &init_mm) { cpu_set_reserved_ttbr0(); return; } check_and_switch_context(next); } static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { if (prev != next) __switch_mm(next); /* * Update the saved TTBR0_EL1 of the scheduled-in task as the previous * value may have not been initialised yet (activate_mm caller) or the * ASID has changed since the last run (following the context switch * of another thread of the same process). */ update_saved_ttbr0(tsk, next); } static inline const struct cpumask * __task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask) { if (!static_branch_unlikely(&arm64_mismatched_32bit_el0)) return mask; if (!is_compat_thread(task_thread_info(p))) return mask; return system_32bit_el0_cpumask(); } static inline const struct cpumask * task_cpu_possible_mask(struct task_struct *p) { return __task_cpu_possible_mask(p, cpu_possible_mask); } #define task_cpu_possible_mask task_cpu_possible_mask const struct cpumask *task_cpu_fallback_mask(struct task_struct *p); void verify_cpu_asid_bits(void); void post_ttbr_update_workaround(void); unsigned long arm64_mm_context_get(struct mm_struct *mm); void arm64_mm_context_put(struct mm_struct *mm); #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { return -1UL >> 8; } /* * Only enforce protection keys on the current process, because there is no * user context to access POR_EL0 for another address space. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { if (!system_supports_poe()) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return por_el0_allows_pkey(vma_pkey(vma), write, execute); } #define deactivate_mm deactivate_mm static inline void deactivate_mm(struct task_struct *tsk, struct mm_struct *mm) { gcs_free(tsk); } #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_MMU_CONTEXT_H */
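init_new_context() above seeds mm->context.pkey_allocation_map with BIT(0) so that the default protection key can never be handed out, and arch_dup_pkeys() simply copies that bitmap into the child mm on fork. The underlying pattern is an ordinary allocation bitmap with one permanently reserved bit; a standalone sketch (hypothetical demo_* names, not the kernel's pkey interface) is:

/* Demo: allocation bitmap with bit 0 permanently reserved. */
#include <stdio.h>

#define DEMO_NR_KEYS 16

static unsigned int demo_map = 1u << 0;	/* key 0 reserved from the start */

static int demo_key_alloc(void)
{
	for (int k = 0; k < DEMO_NR_KEYS; k++) {
		if (!(demo_map & (1u << k))) {
			demo_map |= 1u << k;
			return k;
		}
	}
	return -1;	/* all keys in use */
}

static void demo_key_free(int k)
{
	if (k > 0 && k < DEMO_NR_KEYS)	/* never release the reserved key 0 */
		demo_map &= ~(1u << k);
}

int main(void)
{
	int a = demo_key_alloc();	/* 1 */
	int b = demo_key_alloc();	/* 2 */

	demo_key_free(a);
	printf("allocated %d and %d, map now %#x\n", a, b, demo_map);	/* 0x5 */
	return 0;
}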
/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_MAIN_H #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm #define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } #define kvm_trace_exit_reason \ ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), TP_ARGS(reason, errno), TP_STRUCT__entry( __field( __u32, reason ) __field( int, errno ) ), TP_fast_assign( __entry->reason = reason; __entry->errno = errno; ), TP_printk("reason %s (%d)", __entry->errno < 0 ? (__entry->errno == -EINTR ? "restart" : "error") : __print_symbolic(__entry->reason, kvm_trace_exit_reason), __entry->errno < 0 ? 
-__entry->errno : __entry->reason) ); TRACE_EVENT(kvm_vcpu_wakeup, TP_PROTO(__u64 ns, bool waited, bool valid), TP_ARGS(ns, waited, valid), TP_STRUCT__entry( __field( __u64, ns ) __field( bool, waited ) __field( bool, valid ) ), TP_fast_assign( __entry->ns = ns; __entry->waited = waited; __entry->valid = valid; ), TP_printk("%s time %lld ns, polling %s", __entry->waited ? "wait" : "poll", __entry->ns, __entry->valid ? "valid" : "invalid") ); #if defined(CONFIG_HAVE_KVM_IRQCHIP) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), TP_STRUCT__entry( __field( unsigned int, gsi ) __field( int, level ) __field( int, irq_source_id ) ), TP_fast_assign( __entry->gsi = gsi; __entry->level = level; __entry->irq_source_id = irq_source_id; ), TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ {0x0, "Fixed"}, \ {0x1, "LowPrio"}, \ {0x2, "SMI"}, \ {0x3, "Res3"}, \ {0x4, "NMI"}, \ {0x5, "INIT"}, \ {0x6, "SIPI"}, \ {0x7, "ExtINT"} TRACE_EVENT(kvm_ioapic_set_irq, TP_PROTO(__u64 e, int pin, bool coalesced), TP_ARGS(e, pin, coalesced), TP_STRUCT__entry( __field( __u64, e ) __field( int, pin ) __field( bool, coalesced ) ), TP_fast_assign( __entry->e = e; __entry->pin = pin; __entry->coalesced = coalesced; ), TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "", __entry->coalesced ? " (coalesced)" : "") ); TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, TP_PROTO(__u64 e), TP_ARGS(e), TP_STRUCT__entry( __field( __u64, e ) ), TP_fast_assign( __entry->e = e; ), TP_printk("dst %x vec %u (%s|%s|%s%s)", (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "") ); TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), TP_ARGS(address, data), TP_STRUCT__entry( __field( __u64, address ) __field( __u64, data ) ), TP_fast_assign( __entry->address = address; __entry->data = data; ), TP_printk("dst %llx vec %u (%s|%s|%s%s)", (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", (__entry->address & (1<<3)) ? 
"|rh" : "") ); #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} #endif /* defined(__KVM_HAVE_IOAPIC) */ #if defined(CONFIG_HAVE_KVM_IRQCHIP) #ifdef kvm_irqchips #define kvm_ack_irq_string "irqchip %s pin %u" #define kvm_ack_irq_parm __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin #else #define kvm_ack_irq_string "irqchip %d pin %u" #define kvm_ack_irq_parm __entry->irqchip, __entry->pin #endif TRACE_EVENT(kvm_ack_irq, TP_PROTO(unsigned int irqchip, unsigned int pin), TP_ARGS(irqchip, pin), TP_STRUCT__entry( __field( unsigned int, irqchip ) __field( unsigned int, pin ) ), TP_fast_assign( __entry->irqchip = irqchip; __entry->pin = pin; ), TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 #define KVM_TRACE_MMIO_READ 1 #define KVM_TRACE_MMIO_WRITE 2 #define kvm_trace_symbol_mmio \ { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_MMIO_READ, "read" }, \ { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_mmio), __entry->len, __entry->gpa, __entry->val) ); #define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 #define KVM_TRACE_IOCSR_READ 1 #define KVM_TRACE_IOCSR_WRITE 2 #define kvm_trace_symbol_iocsr \ { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_IOCSR_READ, "read" }, \ { KVM_TRACE_IOCSR_WRITE, "write" } TRACE_EVENT(kvm_iocsr, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), __entry->len, __entry->gpa, __entry->val) ); #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} TRACE_EVENT(kvm_fpu, TP_PROTO(int load), TP_ARGS(load), TP_STRUCT__entry( __field( u32, load ) ), TP_fast_assign( __entry->load = load; ), TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn), TP_STRUCT__entry( __field(__u64, gva) __field(u64, gfn) ), TP_fast_assign( __entry->gva = gva; __entry->gfn = gfn; ), TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva), TP_STRUCT__entry( __field(__u64, token) __field(__u64, gva) ), TP_fast_assign( __entry->token = token; __entry->gva = gva; ), TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) ); 
DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); TRACE_EVENT( kvm_async_pf_completed, TP_PROTO(unsigned long address, u64 gva), TP_ARGS(address, gva), TP_STRUCT__entry( __field(unsigned long, address) __field(u64, gva) ), TP_fast_assign( __entry->address = address; __entry->gva = gva; ), TP_printk("gva %#llx address %#lx", __entry->gva, __entry->address) ); #endif TRACE_EVENT(kvm_halt_poll_ns, TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new, unsigned int old), TP_ARGS(grow, vcpu_id, new, old), TP_STRUCT__entry( __field(bool, grow) __field(unsigned int, vcpu_id) __field(unsigned int, new) __field(unsigned int, old) ), TP_fast_assign( __entry->grow = grow; __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), TP_printk("vcpu %u: halt_poll_ns %u (%s %u)", __entry->vcpu_id, __entry->new, __entry->grow ? "grow" : "shrink", __entry->old) ); #define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(true, vcpu_id, new, old) #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) TRACE_EVENT(kvm_dirty_ring_push, TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset), TP_ARGS(ring, slot, offset), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) __field(u32, slot) __field(u64, offset) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; __entry->slot = slot; __entry->offset = offset; ), TP_printk("ring %d: dirty 0x%x reset 0x%x " "slot %u offset 0x%llx (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->slot, __entry->offset, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_reset, TP_PROTO(struct kvm_dirty_ring *ring), TP_ARGS(ring), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; ), TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_exit, TP_PROTO(struct kvm_vcpu *vcpu), TP_ARGS(vcpu), TP_STRUCT__entry( __field(int, vcpu_id) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; ), TP_printk("vcpu %d", __entry->vcpu_id) ); TRACE_EVENT(kvm_unmap_hva_range, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_age_hva, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier age hva: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_test_age_hva, TP_PROTO(unsigned long hva), TP_ARGS(hva), TP_STRUCT__entry( __field( unsigned long, hva ) ), TP_fast_assign( __entry->hva = hva; ), TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); #endif /* _TRACE_KVM_MAIN_H 
*/ /* This part must be outside protection */ #include <trace/define_trace.h>
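With trace/define_trace.h pulled in at the end, every TRACE_EVENT() in this header becomes a tracepoint under the "kvm" trace system, visible in tracefs on kernels built with tracing and KVM support. A rough userspace sketch for enabling and streaming those events (assuming tracefs is mounted at /sys/kernel/tracing; exact paths and required privileges vary by system) is:

/* Demo: enable the kvm trace events and stream them from trace_pipe. */
#include <stdio.h>
#include <string.h>

static int demo_write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char line[4096];
	FILE *fp;

	/* enable every event in the kvm trace system (typically needs root) */
	if (demo_write_file("/sys/kernel/tracing/events/kvm/enable", "1"))
		return 1;

	/* trace_pipe blocks and streams formatted events until interrupted */
	fp = fopen("/sys/kernel/tracing/trace_pipe", "r");
	if (!fp)
		return 1;
	while (fgets(line, sizeof(line), fp))
		if (strstr(line, "kvm_"))
			fputs(line, stdout);
	fclose(fp);
	return 0;
}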
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. 
See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. 
*/ typedef struct { spinlock_t slock; int owned; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. */ struct sock_common { union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. 
We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; struct sk_filter; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. 
%SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash __cacheline_group_begin(sock_write_rx); atomic_t sk_drops; __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. 
*/ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); /* early demux fields */ struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; unsigned int sk_napi_id; u16 sk_busy_poll_budget; u8 sk_prefer_busy_poll; #endif u8 sk_userlocks; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; void (*sk_data_ready)(struct sock *sk); long sk_rcvtimeo; int sk_rcvlowat; __cacheline_group_end(sock_read_rx); __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; struct mem_cgroup *sk_memcg; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); socket_lock_t sk_lock; u32 sk_reserved_mem; int sk_forward_alloc; u32 sk_tsflags; __cacheline_group_end(sock_write_rxtx); __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; int sk_sndbuf; int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; struct timer_list sk_timer; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif u16 sk_gso_type; u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all * changes are protected by socket lock. 
*/ u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; u16 sk_type; u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif int sk_disconnects; u8 sk_txrehash; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; void (*sk_state_change)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; }; struct sock_bh_locked { struct sock *sock; local_lock_t bh_lock; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* flag bits in sk_user_data * * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. */ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. 
Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. 
it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) #define sk_for_each_bound_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. 
* */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) /* * The highest bit of sk_tsflags is reserved for kernel-internal * SOCKCM_FLAG_TS_OPT_ID. 
There is a check in core/sock.c to control that * SOF_TIMESTAMPING* values do not reach this reserved area */ #define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } static inline void sk_forward_alloc_add(struct sock *sk, int val) { /* Paired with lockless reads of sk->sk_forward_alloc */ WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. 
*/ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } struct proto_accept_arg { int flags; int err; int is_empty; bool kern; }; /* Networking protocol blocks we attach to sockets. 
* socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); bool (*bpf_bypass_getsockopt)(int level, int optname); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); #endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif #if IS_ENABLED(CONFIG_MPTCP) int (*forward_alloc_get)(const struct sock *sk); #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. 
*/ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline int sk_forward_alloc_get(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP) if (sk->sk_prot->forward_alloc_get) return sk->sk_prot->forward_alloc_get(sk); #endif return READ_ONCE(sk->sk_forward_alloc); } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int all; int val[PROTO_INUSE_NR]; }; static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } static inline void sock_inuse_add(const struct net *net, int val) { this_cpu_add(net->core.prot_inuse->all, val); } int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { } static inline void sock_inuse_add(const struct net *net, int val) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this 
version is not worse. */ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { return READ_ONCE(sk->sk_prot->sysctl_mem[index]); } static inline int sk_mem_pages(int amt) { return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || pfmemalloc; } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) { int unused_mem; if (likely(!sk->sk_reserved_mem)) return 0; unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued - atomic_read(&sk->sk_rmem_alloc); return unused_mem > 0 ? unused_mem : 0; } static inline void sk_mem_reclaim(struct sock *sk) { int reclaimable; if (!sk_has_account(sk)) return; reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } static inline void sk_mem_reclaim_final(struct sock *sk) { sk->sk_reserved_mem = 0; sk_mem_reclaim(sk); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. 
*/ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process won't block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ static inline bool lock_sock_fast(struct sock *sk) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); return __lock_sock_fast(sk); } /* fast socket lock variant for caller already holding a [different] socket lock */ static inline bool lock_sock_fast_nested(struct sock *sk) { mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); return __lock_sock_fast(sk); } /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) __releases(&sk->sk_lock.slock) { if (slow) { release_sock(sk); __release(&sk->sk_lock.slock); } else { mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); bool sockopt_ns_capable(struct user_namespace *ns, int cap); bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. 
*/ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline void sock_not_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } static inline void sock_release_ownership(struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sock_owned_by_user_nocheck(sk) && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); if (refcount_inc_not_zero(&sk->sk_refcnt)) { skb->sk = sk; skb->destructor = sock_edemux; } } #else #define sock_edemux sock_efree #endif int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen); int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); static inline void sock_replace_proto(struct sock *sk, struct proto *proto) { if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); WRITE_ONCE(sk->sk_prot, proto); } struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; u32 ts_opt_id; u32 priority; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .tsflags = 
READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables using an explicit uid. */ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); /* Initialise core socket variables. * Assumes struct socket *sock is embedded in a struct socket_alloc. */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. * * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) 
*/ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } static inline int sk_tx_queue_get(const struct sock *sk) { if (sk) { /* Paired with WRITE_ONCE() in sk_tx_queue_clear() * and sk_tx_queue_set(). */ int val = READ_ONCE(sk->sk_tx_queue_mapping); if (val != NO_QUEUE_MAPPING) return val; } return -1; } static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (force_set || unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, true); } static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, false); } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING); #endif } static inline int sk_rx_queue_get(const struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk) { int res = READ_ONCE(sk->sk_rx_queue_mapping); if (res != NO_QUEUE_MAPPING) return res; } #endif return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } kuid_t sock_i_uid(struct sock *sk); unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? 
sk->sk_uid : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(const struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !rcuref_get(&dst->__rcuref)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst))); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); neigh_confirm(n); } } bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_gso_disable(struct sock *sk) { sk->sk_gso_disabled = 1; sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int 
skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb_len_add(skb, copy); sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } /** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - 1; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } /** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); } /** * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { /* Provides a barrier we need to be sure we are in sync * with the socket flags modification. * * This memory barrier is paired in the wq_has_sleeper. */ poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received. 
*/ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) { skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (skb) { if (sk_rmem_schedule(sk, skb, skb->truesize)) { skb_set_owner_r(skb, sk); return skb; } __kfree_skb(skb); } return NULL; } static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { skb_orphan(skb); return; } skb->slow_gro = 1; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return sock_queue_rcv_skb_reason(sk, skb, NULL); } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } void sk_error_report(struct sock *sk); static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) { if (unlikely(sock_flag(sk, SOCK_FASYNC))) sock_wake_async(rcu_dereference(sk->sk_wq), how, band); } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at * minimum. 
*/ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_rcvtimeo; } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_sndtimeo; } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ #define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb))) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? 
atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK) |\ (1UL << SOCK_RCVPRIORITY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (sk->sk_flags & FLAGS_RECV_CMSGS || READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { __u32 tsflags = sockc->tsflags; if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { if (tsflags & SOCKCM_FLAG_TS_OPT_ID) *tskey = sockc->ts_opt_id; else *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags) { _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } static inline void 
skb_setup_tx_timestamp(struct sk_buff *skb, const struct sockcm_cookie *sockc) { _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_inet(const struct sock *sk) { int family = READ_ONCE(sk->sk_family); return family == AF_INET || family == AF_INET6; } static inline bool sk_is_tcp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_udp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) { return sk->sk_family == AF_VSOCK; } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) * TCP RST and ACK can be attached to TIME_WAIT. */ static inline bool sk_listener_or_tw(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. 
This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. * They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { if (sk->sk_prot->sock_is_readable) return sk->sk_prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
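The long locking comment above describes how BH context queues packets on the backlog while a process owns the socket, and how the owner drains that queue before releasing the lock. As a quick illustration, here is a minimal sketch (not part of the header) of the receive pattern those helpers are built for, loosely modelled on what protocol handlers such as tcp_v4_rcv() do; the function name and the use of sk->sk_rcvbuf as the backlog limit are assumptions made only for this example.

/*
 * Illustrative sketch: direct processing when the socket is unowned,
 * backlog queueing when a process context currently owns it.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/* No process owns the socket: handle the skb right away
		 * via sk->sk_backlog_rcv (sk_backlog_rcv() wrapper).
		 */
		ret = sk_backlog_rcv(sk, skb);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		/* Owner is busy and the backlog is full: drop the skb. */
		bh_unlock_sock(sk);
		kfree_skb(skb);
		return -ENOBUFS;
	}
	bh_unlock_sock(sk);

	return ret;
}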
// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler and other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); skb_reset_mac_len(skb); /* Only frames received over the interlink port are assigned a * sequence number and therefore require serialisation against other senders.
*/ if (port->type == HSR_PT_INTERLINK) { spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); } else { hsr_forward_skb(skb, port); } finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; res = netdev_upper_dev_link(dev, hsr_dev, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } list_add_tail_rcu(&port->port_list, &hsr->ports); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: kfree(port); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } kfree_rcu(port, rcu); }
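The HSR slave code above attaches hsr_handle_frame() to each slave device with netdev_rx_handler_register() and detects an existing attachment by comparing dev->rx_handler. The sketch below is a minimal, hypothetical illustration of that rx_handler contract (foo_handle_frame() and foo_attach() are invented names, not HSR functions); it only shows the consume/pass decision and the registration call.

/*
 * Sketch of the rx_handler pattern used by hsr_handle_frame(): the
 * handler sees every frame received on the slave device and must
 * either consume the skb or pass it back to the normal stack.
 */
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return RX_HANDLER_PASS;		/* let the stack keep it */

	kfree_skb(skb);				/* we own the skb once consumed */
	return RX_HANDLER_CONSUMED;
}

static int foo_attach(struct net_device *slave, void *priv)
{
	/*
	 * Called under RTNL (as hsr_portdev_setup() is); fails with
	 * -EBUSY if another rx_handler already owns the device.
	 */
	return netdev_rx_handler_register(slave, foo_handle_frame, priv);
}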
// SPDX-License-Identifier: GPL-2.0-only /* * GICv3 ITS emulation * * Copyright (C) 2015,2016 ARM Ltd.
* Author: Andre Przywara <andre.przywara@arm.com> */ #include <linux/cpu.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/list_sort.h> #include <linux/irqchip/arm-gic-v3.h> #include <asm/kvm_emulate.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include "vgic.h" #include "vgic-mmio.h" static struct kvm_device_ops kvm_arm_vgic_its_ops; static int vgic_its_save_tables_v0(struct vgic_its *its); static int vgic_its_restore_tables_v0(struct vgic_its *its); static int vgic_its_commit_v0(struct vgic_its *its); static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu, bool needs_inv); #define vgic_its_read_entry_lock(i, g, valp, t) \ ({ \ int __sz = vgic_its_get_abi(i)->t##_esz; \ struct kvm *__k = (i)->dev->kvm; \ int __ret; \ \ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ sizeof(*(valp)) != ABI_0_ESZ); \ if (NR_ITS_ABIS > 1 && \ KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \ __ret = -EINVAL; \ else \ __ret = kvm_read_guest_lock(__k, (g), \ valp, __sz); \ __ret; \ }) #define vgic_its_write_entry_lock(i, g, val, t) \ ({ \ int __sz = vgic_its_get_abi(i)->t##_esz; \ struct kvm *__k = (i)->dev->kvm; \ typeof(val) __v = (val); \ int __ret; \ \ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ sizeof(__v) != ABI_0_ESZ); \ if (NR_ITS_ABIS > 1 && \ KVM_BUG_ON(__sz != sizeof(__v), __k)) \ __ret = -EINVAL; \ else \ __ret = vgic_write_guest_lock(__k, (g), \ &__v, __sz); \ __ret; \ }) /* * Creates a new (reference to a) struct vgic_irq for a given LPI. * If this LPI is already mapped on another ITS, we increase its refcount * and return a pointer to the existing structure. * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. * This function returns a pointer to the _unlocked_ structure. */ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, struct kvm_vcpu *vcpu) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. */ if (irq) return irq; irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!irq) return ERR_PTR(-ENOMEM); ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); } INIT_LIST_HEAD(&irq->ap_list); raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; kref_init(&irq->refcount); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; xa_lock_irqsave(&dist->lpi_xa, flags); /* * There could be a race with another vgic_add_lpi(), so we need to * check that we don't add a second list entry with the same LPI. */ oldirq = xa_load(&dist->lpi_xa, intid); if (vgic_try_get_irq_kref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; goto out_unlock; } ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); if (ret) { xa_release(&dist->lpi_xa, intid); kfree(irq); } out_unlock: xa_unlock_irqrestore(&dist->lpi_xa, flags); if (ret) return ERR_PTR(ret); /* * We "cache" the configuration table entries in our struct vgic_irq's. * However we only have those structs for mapped IRQs, so we read in * the respective config data from memory here upon mapping the LPI. * * Should any of these fail, behave as if we couldn't create the LPI * by dropping the refcount and returning the error. 
*/ ret = update_lpi_config(kvm, irq, NULL, false); if (ret) { vgic_put_irq(kvm, irq); return ERR_PTR(ret); } ret = vgic_v3_lpi_sync_pending_status(kvm, irq); if (ret) { vgic_put_irq(kvm, irq); return ERR_PTR(ret); } return irq; } struct its_device { struct list_head dev_list; /* the head for the list of ITTEs */ struct list_head itt_head; u32 num_eventid_bits; gpa_t itt_addr; u32 device_id; }; #define COLLECTION_NOT_MAPPED ((u32)~0) struct its_collection { struct list_head coll_list; u32 collection_id; u32 target_addr; }; #define its_is_collection_mapped(coll) ((coll) && \ ((coll)->target_addr != COLLECTION_NOT_MAPPED)) struct its_ite { struct list_head ite_list; struct vgic_irq *irq; struct its_collection *collection; u32 event_id; }; /** * struct vgic_its_abi - ITS abi ops and settings * @cte_esz: collection table entry size * @dte_esz: device table entry size * @ite_esz: interrupt translation table entry size * @save_tables: save the ITS tables into guest RAM * @restore_tables: restore the ITS internal structs from tables * stored in guest RAM * @commit: initialize the registers which expose the ABI settings, * especially the entry sizes */ struct vgic_its_abi { int cte_esz; int dte_esz; int ite_esz; int (*save_tables)(struct vgic_its *its); int (*restore_tables)(struct vgic_its *its); int (*commit)(struct vgic_its *its); }; #define ABI_0_ESZ 8 #define ESZ_MAX ABI_0_ESZ static const struct vgic_its_abi its_table_abi_versions[] = { [0] = { .cte_esz = ABI_0_ESZ, .dte_esz = ABI_0_ESZ, .ite_esz = ABI_0_ESZ, .save_tables = vgic_its_save_tables_v0, .restore_tables = vgic_its_restore_tables_v0, .commit = vgic_its_commit_v0, }, }; #define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) { return &its_table_abi_versions[its->abi_rev]; } static int vgic_its_set_abi(struct vgic_its *its, u32 rev) { const struct vgic_its_abi *abi; its->abi_rev = rev; abi = vgic_its_get_abi(its); return abi->commit(its); } /* * Find and returns a device in the device table for an ITS. * Must be called with the its_lock mutex held. */ static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) { struct its_device *device; list_for_each_entry(device, &its->device_list, dev_list) if (device_id == device->device_id) return device; return NULL; } /* * Find and returns an interrupt translation table entry (ITTE) for a given * Device ID/Event ID pair on an ITS. * Must be called with the its_lock mutex held. */ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, u32 event_id) { struct its_device *device; struct its_ite *ite; device = find_its_device(its, device_id); if (device == NULL) return NULL; list_for_each_entry(ite, &device->itt_head, ite_list) if (ite->event_id == event_id) return ite; return NULL; } /* To be used as an iterator this macro misses the enclosing parentheses */ #define for_each_lpi_its(dev, ite, its) \ list_for_each_entry(dev, &(its)->device_list, dev_list) \ list_for_each_entry(ite, &(dev)->itt_head, ite_list) #define GIC_LPI_OFFSET 8192 #define VITS_TYPER_IDBITS 16 #define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1) #define VITS_TYPER_DEVBITS 16 #define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1) #define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) #define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) /* * Finds and returns a collection in the ITS collection table. * Must be called with the its_lock mutex held. 
*/ static struct its_collection *find_collection(struct vgic_its *its, int coll_id) { struct its_collection *collection; list_for_each_entry(collection, &its->collection_list, coll_list) { if (coll_id == collection->collection_id) return collection; } return NULL; } #define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) #define LPI_PROP_PRIORITY(p) ((p) & 0xfc) /* * Reads the configuration data for a given LPI from guest memory and * updates the fields in struct vgic_irq. * If filter_vcpu is not NULL, applies only if the IRQ is targeting this * VCPU. Unconditionally applies if filter_vcpu is NULL. */ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu, bool needs_inv) { u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); u8 prop; int ret; unsigned long flags; ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, &prop, 1); if (ret) return ret; raw_spin_lock_irqsave(&irq->irq_lock, flags); if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { irq->priority = LPI_PROP_PRIORITY(prop); irq->enabled = LPI_PROP_ENABLE_BIT(prop); if (!irq->hw) { vgic_queue_irq_unlock(kvm, irq, flags); return 0; } } raw_spin_unlock_irqrestore(&irq->irq_lock, flags); if (irq->hw) return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); return 0; } static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) { int ret = 0; unsigned long flags; raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->target_vcpu = vcpu; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); if (irq->hw) { struct its_vlpi_map map; ret = its_get_vlpi(irq->host_irq, &map); if (ret) return ret; if (map.vpe) atomic_dec(&map.vpe->vlpi_count); map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; atomic_inc(&map.vpe->vlpi_count); ret = its_map_vlpi(irq->host_irq, &map); } return ret; } static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm, struct its_collection *col) { return kvm_get_vcpu_by_id(kvm, col->target_addr); } /* * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI * is targeting) to the VGIC's view, which deals with target VCPUs. * Needs to be called whenever either the collection for a LPIs has * changed or the collection itself got retargeted. */ static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) { struct kvm_vcpu *vcpu; if (!its_is_collection_mapped(ite->collection)) return; vcpu = collection_to_vcpu(kvm, ite->collection); update_affinity(ite->irq, vcpu); } /* * Updates the target VCPU for every LPI targeting this collection. * Must be called with the its_lock mutex held. */ static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, struct its_collection *coll) { struct its_device *device; struct its_ite *ite; for_each_lpi_its(device, ite, its) { if (ite->collection != coll) continue; update_affinity_ite(kvm, ite); } } static u32 max_lpis_propbaser(u64 propbaser) { int nr_idbits = (propbaser & 0x1f) + 1; return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); } /* * Sync the pending table pending bit of LPIs targeting @vcpu * with our own data structures. This relies on the LPI being * mapped before. 
*/ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) { gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); struct vgic_dist *dist = &vcpu->kvm->arch.vgic; unsigned long intid, flags; struct vgic_irq *irq; int last_byte_offset = -1; int ret = 0; u8 pendmask; xa_for_each(&dist->lpi_xa, intid, irq) { int byte_offset, bit_nr; byte_offset = intid / BITS_PER_BYTE; bit_nr = intid % BITS_PER_BYTE; /* * For contiguously allocated LPIs chances are we just read * this very same byte in the last iteration. Reuse that. */ if (byte_offset != last_byte_offset) { ret = kvm_read_guest_lock(vcpu->kvm, pendbase + byte_offset, &pendmask, 1); if (ret) return ret; last_byte_offset = byte_offset; } irq = vgic_get_irq(vcpu->kvm, intid); if (!irq) continue; raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu == vcpu) irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } return ret; } static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 reg = GITS_TYPER_PLPIS; /* * We use linear CPU numbers for redistributor addressing, * so GITS_TYPER.PTA is 0. * Also we force all PROPBASER registers to be the same, so * CommonLPIAff is 0 as well. * To avoid memory waste in the guest, we keep the number of IDBits and * DevBits low - as least for the time being. */ reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; return extract_bytes(reg, addr & 7, len); } static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { u32 val; val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; return val; } static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { u32 rev = GITS_IIDR_REV(val); if (rev >= NR_ITS_ABIS) return -EINVAL; return vgic_its_set_abi(its, rev); } static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { switch (addr & 0xffff) { case GITS_PIDR0: return 0x92; /* part number, bits[7:0] */ case GITS_PIDR1: return 0xb4; /* part number, bits[11:8] */ case GITS_PIDR2: return GIC_PIDR2_ARCH_GICv3 | 0x0b; case GITS_PIDR4: return 0x40; /* This is a 64K software visible page */ /* The following are the ID registers for (any) GIC. 
*/ case GITS_CIDR0: return 0x0d; case GITS_CIDR1: return 0xf0; case GITS_CIDR2: return 0x05; case GITS_CIDR3: return 0xb1; } return 0; } static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db) { struct kvm_io_device *kvm_io_dev; struct vgic_io_device *iodev; kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db); if (!kvm_io_dev) return ERR_PTR(-EINVAL); if (kvm_io_dev->ops != &kvm_io_gic_ops) return ERR_PTR(-EINVAL); iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); if (iodev->iodev_type != IODEV_ITS) return ERR_PTR(-EINVAL); return iodev->its; } static unsigned long vgic_its_cache_key(u32 devid, u32 eventid) { return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid; } static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, u32 devid, u32 eventid) { unsigned long cache_key = vgic_its_cache_key(devid, eventid); struct vgic_its *its; struct vgic_irq *irq; if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID) return NULL; its = __vgic_doorbell_to_its(kvm, db); if (IS_ERR(its)) return NULL; rcu_read_lock(); irq = xa_load(&its->translation_cache, cache_key); if (!vgic_try_get_irq_kref(irq)) irq = NULL; rcu_read_unlock(); return irq; } static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq *irq) { unsigned long cache_key = vgic_its_cache_key(devid, eventid); struct vgic_irq *old; /* Do not cache a directly injected interrupt */ if (irq->hw) return; /* * The irq refcount is guaranteed to be nonzero while holding the * its_lock, as the ITE (and the reference it holds) cannot be freed. */ lockdep_assert_held(&its->its_lock); vgic_get_irq_kref(irq); old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); /* * Put the reference taken on @irq if the store fails. Intentionally do * not return the error as the translation cache is best effort. */ if (xa_is_err(old)) { vgic_put_irq(kvm, irq); return; } /* * We could have raced with another CPU caching the same * translation behind our back, ensure we don't leak a * reference if that is the case. 
*/ if (old) vgic_put_irq(kvm, old); } static void vgic_its_invalidate_cache(struct vgic_its *its) { struct kvm *kvm = its->dev->kvm; struct vgic_irq *irq; unsigned long idx; xa_for_each(&its->translation_cache, idx, irq) { xa_erase(&its->translation_cache, idx); vgic_put_irq(kvm, irq); } } void vgic_its_invalidate_all_caches(struct kvm *kvm) { struct kvm_device *dev; struct vgic_its *its; rcu_read_lock(); list_for_each_entry_rcu(dev, &kvm->devices, vm_node) { if (dev->ops != &kvm_arm_vgic_its_ops) continue; its = dev->private; vgic_its_invalidate_cache(its); } rcu_read_unlock(); } int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq) { struct kvm_vcpu *vcpu; struct its_ite *ite; if (!its->enabled) return -EBUSY; ite = find_ite(its, devid, eventid); if (!ite || !its_is_collection_mapped(ite->collection)) return E_ITS_INT_UNMAPPED_INTERRUPT; vcpu = collection_to_vcpu(kvm, ite->collection); if (!vcpu) return E_ITS_INT_UNMAPPED_INTERRUPT; if (!vgic_lpis_enabled(vcpu)) return -EBUSY; vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); *irq = ite->irq; return 0; } struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) { u64 address; if (!vgic_has_its(kvm)) return ERR_PTR(-ENODEV); if (!(msi->flags & KVM_MSI_VALID_DEVID)) return ERR_PTR(-EINVAL); address = (u64)msi->address_hi << 32 | msi->address_lo; return __vgic_doorbell_to_its(kvm, address); } /* * Find the target VCPU and the LPI number for a given devid/eventid pair * and make this IRQ pending, possibly injecting it. * Must be called with the its_lock mutex held. * Returns 0 on success, a positive error value for any ITS mapping * related errors and negative error values for generic errors. */ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid) { struct vgic_irq *irq = NULL; unsigned long flags; int err; err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); if (err) return err; if (irq->hw) return irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, true); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq, flags); return 0; } int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) { struct vgic_irq *irq; unsigned long flags; phys_addr_t db; db = (u64)msi->address_hi << 32 | msi->address_lo; irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); if (!irq) return -EWOULDBLOCK; raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq, flags); vgic_put_irq(kvm, irq); return 0; } /* * Queries the KVM IO bus framework to get the ITS pointer from the given * doorbell address. * We then call vgic_its_trigger_msi() with the decoded data. * According to the KVM_SIGNAL_MSI API description returns 1 on success. */ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) { struct vgic_its *its; int ret; if (!vgic_its_inject_cached_translation(kvm, msi)) return 1; its = vgic_msi_to_its(kvm, msi); if (IS_ERR(its)) return PTR_ERR(its); mutex_lock(&its->its_lock); ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); mutex_unlock(&its->its_lock); if (ret < 0) return ret; /* * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 * if the guest has blocked the MSI. So we map any LPI mapping * related error to that. */ if (ret) return 0; else return 1; } /* Requires the its_lock to be held. 
*/ static void its_free_ite(struct kvm *kvm, struct its_ite *ite) { list_del(&ite->ite_list); /* This put matches the get in vgic_add_lpi. */ if (ite->irq) { if (ite->irq->hw) WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); vgic_put_irq(kvm, ite->irq); } kfree(ite); } static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) { return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); } #define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) #define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) #define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) #define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) #define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) #define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) #define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) #define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) #define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) /* * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); struct its_ite *ite; ite = find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { struct its_device *device = find_its_device(its, device_id); int ite_esz = vgic_its_get_abi(its)->ite_esz; gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the * pending state is a property of the ITTE struct. */ vgic_its_invalidate_cache(its); its_free_ite(kvm, ite); return vgic_its_write_entry_lock(its, gpa, 0ULL, ite); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; } /* * The MOVI command moves an ITTE to a different collection. * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); u32 coll_id = its_cmd_get_collection(its_cmd); struct kvm_vcpu *vcpu; struct its_ite *ite; struct its_collection *collection; ite = find_ite(its, device_id, event_id); if (!ite) return E_ITS_MOVI_UNMAPPED_INTERRUPT; if (!its_is_collection_mapped(ite->collection)) return E_ITS_MOVI_UNMAPPED_COLLECTION; collection = find_collection(its, coll_id); if (!its_is_collection_mapped(collection)) return E_ITS_MOVI_UNMAPPED_COLLECTION; ite->collection = collection; vcpu = collection_to_vcpu(kvm, collection); vgic_its_invalidate_cache(its); return update_affinity(ite->irq, vcpu); } static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa) { gfn_t gfn = gpa >> PAGE_SHIFT; int idx; bool ret; idx = srcu_read_lock(&its->dev->kvm->srcu); ret = kvm_is_visible_gfn(its->dev->kvm, gfn); srcu_read_unlock(&its->dev->kvm->srcu, idx); return ret; } /* * Check whether an ID can be stored into the corresponding guest table. * For a direct table this is pretty easy, but gets a bit nasty for * indirect tables. We check whether the resulting guest physical address * is actually valid (covered by a memslot and guest accessible). * For this we have to read the respective first level entry. 
*/ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, gpa_t *eaddr) { int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; u64 indirect_ptr, type = GITS_BASER_TYPE(baser); phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); int index; switch (type) { case GITS_BASER_TYPE_DEVICE: if (id > VITS_MAX_DEVID) return false; break; case GITS_BASER_TYPE_COLLECTION: /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ if (id >= BIT_ULL(16)) return false; break; default: return false; } if (!(baser & GITS_BASER_INDIRECT)) { phys_addr_t addr; if (id >= (l1_tbl_size / esz)) return false; addr = base + id * esz; if (eaddr) *eaddr = addr; return __is_visible_gfn_locked(its, addr); } /* calculate and check the index into the 1st level */ index = id / (SZ_64K / esz); if (index >= (l1_tbl_size / sizeof(u64))) return false; /* Each 1st level entry is represented by a 64-bit value. */ if (kvm_read_guest_lock(its->dev->kvm, base + index * sizeof(indirect_ptr), &indirect_ptr, sizeof(indirect_ptr))) return false; indirect_ptr = le64_to_cpu(indirect_ptr); /* check the valid bit of the first level entry */ if (!(indirect_ptr & BIT_ULL(63))) return false; /* Mask the guest physical address and calculate the frame number. */ indirect_ptr &= GENMASK_ULL(51, 16); /* Find the address of the actual entry */ index = id % (SZ_64K / esz); indirect_ptr += index * esz; if (eaddr) *eaddr = indirect_ptr; return __is_visible_gfn_locked(its, indirect_ptr); } /* * Check whether an event ID can be stored in the corresponding Interrupt * Translation Table, which starts at device->itt_addr. */ static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device, u32 event_id) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); int ite_esz = abi->ite_esz; gpa_t gpa; /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */ if (event_id >= BIT_ULL(device->num_eventid_bits)) return false; gpa = device->itt_addr + event_id * ite_esz; return __is_visible_gfn_locked(its, gpa); } /* * Add a new collection into the ITS collection table. * Returns 0 on success, and a negative error value for generic errors. */ static int vgic_its_alloc_collection(struct vgic_its *its, struct its_collection **colp, u32 coll_id) { struct its_collection *collection; collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); if (!collection) return -ENOMEM; collection->collection_id = coll_id; collection->target_addr = COLLECTION_NOT_MAPPED; list_add_tail(&collection->coll_list, &its->collection_list); *colp = collection; return 0; } static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) { struct its_collection *collection; struct its_device *device; struct its_ite *ite; /* * Clearing the mapping for that collection ID removes the * entry from the list. If there wasn't any before, we can * go home early. 
*/ collection = find_collection(its, coll_id); if (!collection) return; for_each_lpi_its(device, ite, its) if (ite->collection && ite->collection->collection_id == coll_id) ite->collection = NULL; list_del(&collection->coll_list); kfree(collection); } /* Must be called with its_lock mutex held */ static struct its_ite *vgic_its_alloc_ite(struct its_device *device, struct its_collection *collection, u32 event_id) { struct its_ite *ite; ite = kzalloc(sizeof(*ite), GFP_KERNEL_ACCOUNT); if (!ite) return ERR_PTR(-ENOMEM); ite->event_id = event_id; ite->collection = collection; list_add_tail(&ite->ite_list, &device->itt_head); return ite; } /* * The MAPTI and MAPI commands map LPIs to ITTEs. * Must be called with its_lock mutex held. */ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); u32 coll_id = its_cmd_get_collection(its_cmd); struct its_ite *ite; struct kvm_vcpu *vcpu = NULL; struct its_device *device; struct its_collection *collection, *new_coll = NULL; struct vgic_irq *irq; int lpi_nr; device = find_its_device(its, device_id); if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; if (!vgic_its_check_event_id(its, device, event_id)) return E_ITS_MAPTI_ID_OOR; if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) lpi_nr = its_cmd_get_physical_id(its_cmd); else lpi_nr = event_id; if (lpi_nr < GIC_LPI_OFFSET || lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) return E_ITS_MAPTI_PHYSICALID_OOR; /* If there is an existing mapping, behavior is UNPREDICTABLE. */ if (find_ite(its, device_id, event_id)) return 0; collection = find_collection(its, coll_id); if (!collection) { int ret; if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) return E_ITS_MAPC_COLLECTION_OOR; ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; new_coll = collection; } ite = vgic_its_alloc_ite(device, collection, event_id); if (IS_ERR(ite)) { if (new_coll) vgic_its_free_collection(its, coll_id); return PTR_ERR(ite); } if (its_is_collection_mapped(collection)) vcpu = collection_to_vcpu(kvm, collection); irq = vgic_add_lpi(kvm, lpi_nr, vcpu); if (IS_ERR(irq)) { if (new_coll) vgic_its_free_collection(its, coll_id); its_free_ite(kvm, ite); return PTR_ERR(irq); } ite->irq = irq; return 0; } /* Requires the its_lock to be held. */ static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its, struct its_device *device) { struct its_ite *ite, *temp; /* * The spec says that unmapping a device with still valid * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, * since we cannot leave the memory unreferenced. 
*/ list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) its_free_ite(kvm, ite); vgic_its_invalidate_cache(its); list_del(&device->dev_list); kfree(device); } /* its lock must be held */ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) { struct its_device *cur, *temp; list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) vgic_its_free_device(kvm, its, cur); } /* its lock must be held */ static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) { struct its_collection *cur, *temp; list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) vgic_its_free_collection(its, cur->collection_id); } /* Must be called with its_lock mutex held */ static struct its_device *vgic_its_alloc_device(struct vgic_its *its, u32 device_id, gpa_t itt_addr, u8 num_eventid_bits) { struct its_device *device; device = kzalloc(sizeof(*device), GFP_KERNEL_ACCOUNT); if (!device) return ERR_PTR(-ENOMEM); device->device_id = device_id; device->itt_addr = itt_addr; device->num_eventid_bits = num_eventid_bits; INIT_LIST_HEAD(&device->itt_head); list_add_tail(&device->dev_list, &its->device_list); return device; } /* * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); bool valid = its_cmd_get_validbit(its_cmd); u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); struct its_device *device; gpa_t gpa; if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) return E_ITS_MAPD_ITTSIZE_OOR; device = find_its_device(its, device_id); /* * The spec says that calling MAPD on an already mapped device * invalidates all cached data for this device. We implement this * by removing the mapping and re-establishing it. */ if (device) vgic_its_free_device(kvm, its, device); /* * The spec does not say whether unmapping a not-mapped device * is an error, so we are done in any case. */ if (!valid) return vgic_its_write_entry_lock(its, gpa, 0ULL, dte); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); return PTR_ERR_OR_ZERO(device); } /* * The MAPC command maps collection IDs to redistributors. * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u16 coll_id; struct its_collection *collection; bool valid; valid = its_cmd_get_validbit(its_cmd); coll_id = its_cmd_get_collection(its_cmd); if (!valid) { vgic_its_free_collection(its, coll_id); vgic_its_invalidate_cache(its); } else { struct kvm_vcpu *vcpu; vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); if (!vcpu) return E_ITS_MAPC_PROCNUM_OOR; collection = find_collection(its, coll_id); if (!collection) { int ret; if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) return E_ITS_MAPC_COLLECTION_OOR; ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; collection->target_addr = vcpu->vcpu_id; } else { collection->target_addr = vcpu->vcpu_id; update_affinity_collection(kvm, its, collection); } } return 0; } /* * The CLEAR command removes the pending state for a particular LPI. * Must be called with the its_lock mutex held. 
*/ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); struct its_ite *ite; ite = find_ite(its, device_id, event_id); if (!ite) return E_ITS_CLEAR_UNMAPPED_INTERRUPT; ite->irq->pending_latch = false; if (ite->irq->hw) return irq_set_irqchip_state(ite->irq->host_irq, IRQCHIP_STATE_PENDING, false); return 0; } int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq) { return update_lpi_config(kvm, irq, NULL, true); } /* * The INV command syncs the configuration bits from the memory table. * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); struct its_ite *ite; ite = find_ite(its, device_id, event_id); if (!ite) return E_ITS_INV_UNMAPPED_INTERRUPT; return vgic_its_inv_lpi(kvm, ite->irq); } /** * vgic_its_invall - invalidate all LPIs targeting a given vcpu * @vcpu: the vcpu for which the RD is targeted by an invalidation * * Contrary to the INVALL command, this targets a RD instead of a * collection, and we don't need to hold the its_lock, since no ITS is * involved here. */ int vgic_its_invall(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long intid; xa_for_each(&dist->lpi_xa, intid, irq) { irq = vgic_get_irq(kvm, intid); if (!irq) continue; update_lpi_config(kvm, irq, vcpu, false); vgic_put_irq(kvm, irq); } if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); return 0; } /* * The INVALL command requests flushing of all IRQ data in this collection. * Find the VCPU mapped to that collection, then iterate over the VM's list * of mapped LPIs and update the configuration for each IRQ which targets * the specified vcpu. The configuration will be read from the in-memory * configuration table. * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 coll_id = its_cmd_get_collection(its_cmd); struct its_collection *collection; struct kvm_vcpu *vcpu; collection = find_collection(its, coll_id); if (!its_is_collection_mapped(collection)) return E_ITS_INVALL_UNMAPPED_COLLECTION; vcpu = collection_to_vcpu(kvm, collection); vgic_its_invall(vcpu); return 0; } /* * The MOVALL command moves the pending state of all IRQs targeting one * redistributor to another. We don't hold the pending state in the VCPUs, * but in the IRQs instead, so there is really not much to do for us here. * However the spec says that no IRQ must target the old redistributor * afterwards, so we make sure that no LPI is using the associated target_vcpu. * This command affects all LPIs in the system that target that redistributor. 
*/ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu1, *vcpu2; struct vgic_irq *irq; unsigned long intid; /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */ vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); vcpu2 = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32)); if (!vcpu1 || !vcpu2) return E_ITS_MOVALL_PROCNUM_OOR; if (vcpu1 == vcpu2) return 0; xa_for_each(&dist->lpi_xa, intid, irq) { irq = vgic_get_irq(kvm, intid); if (!irq) continue; update_affinity(irq, vcpu2); vgic_put_irq(kvm, irq); } vgic_its_invalidate_cache(its); return 0; } /* * The INT command injects the LPI associated with that DevID/EvID pair. * Must be called with the its_lock mutex held. */ static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u32 msi_data = its_cmd_get_id(its_cmd); u64 msi_devid = its_cmd_get_deviceid(its_cmd); return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); } /* * This function is called with the its_cmd lock held, but the ITS data * structure lock dropped. */ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { int ret = -ENODEV; mutex_lock(&its->its_lock); switch (its_cmd_get_command(its_cmd)) { case GITS_CMD_MAPD: ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); break; case GITS_CMD_MAPC: ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); break; case GITS_CMD_MAPI: ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); break; case GITS_CMD_MAPTI: ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); break; case GITS_CMD_MOVI: ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); break; case GITS_CMD_DISCARD: ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); break; case GITS_CMD_CLEAR: ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); break; case GITS_CMD_MOVALL: ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); break; case GITS_CMD_INT: ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); break; case GITS_CMD_INV: ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); break; case GITS_CMD_INVALL: ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); break; case GITS_CMD_SYNC: /* we ignore this command: we are in sync all of the time */ ret = 0; break; } mutex_unlock(&its->its_lock); return ret; } static u64 vgic_sanitise_its_baser(u64 reg) { reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, GITS_BASER_SHAREABILITY_SHIFT, vgic_sanitise_shareability); reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, GITS_BASER_INNER_CACHEABILITY_SHIFT, vgic_sanitise_inner_cacheability); reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, GITS_BASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); /* We support only one (ITS) page size: 64K */ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; return reg; } static u64 vgic_sanitise_its_cbaser(u64 reg) { reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, GITS_CBASER_SHAREABILITY_SHIFT, vgic_sanitise_shareability); reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, GITS_CBASER_INNER_CACHEABILITY_SHIFT, vgic_sanitise_inner_cacheability); reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, GITS_CBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); /* Sanitise the physical address to be 64k aligned. 
*/ reg &= ~GENMASK_ULL(15, 12); return reg; } static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { return extract_bytes(its->cbaser, addr & 7, len); } static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { /* When GITS_CTLR.Enable is 1, this register is RO. */ if (its->enabled) return; mutex_lock(&its->cmd_lock); its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); its->creadr = 0; /* * CWRITER is architecturally UNKNOWN on reset, but we need to reset * it to CREADR to make sure we start with an empty command buffer. */ its->cwriter = its->creadr; mutex_unlock(&its->cmd_lock); } #define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) #define ITS_CMD_SIZE 32 #define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) /* Must be called with the cmd_lock held. */ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) { gpa_t cbaser; u64 cmd_buf[4]; /* Commands are only processed when the ITS is enabled. */ if (!its->enabled) return; cbaser = GITS_CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, cmd_buf, ITS_CMD_SIZE); /* * If kvm_read_guest() fails, this could be due to the guest * programming a bogus value in CBASER or something else going * wrong from which we cannot easily recover. * According to section 6.3.2 in the GICv3 spec we can just * ignore that command then. */ if (!ret) vgic_its_handle_command(kvm, its, cmd_buf); its->creadr += ITS_CMD_SIZE; if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) its->creadr = 0; } } /* * By writing to CWRITER the guest announces new commands to be processed. * To avoid any races in the first place, we take the its_cmd lock, which * protects our ring buffer variables, so that there is only one user * per ITS handling commands at a given time. 
*/ static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { u64 reg; if (!its) return; mutex_lock(&its->cmd_lock); reg = update_64bit_reg(its->cwriter, addr & 7, len, val); reg = ITS_CMD_OFFSET(reg); if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { mutex_unlock(&its->cmd_lock); return; } its->cwriter = reg; vgic_its_process_commands(kvm, its); mutex_unlock(&its->cmd_lock); } static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { return extract_bytes(its->cwriter, addr & 0x7, len); } static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { return extract_bytes(its->creadr, addr & 0x7, len); } static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { u32 cmd_offset; int ret = 0; mutex_lock(&its->cmd_lock); if (its->enabled) { ret = -EBUSY; goto out; } cmd_offset = ITS_CMD_OFFSET(val); if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { ret = -EINVAL; goto out; } its->creadr = cmd_offset; out: mutex_unlock(&its->cmd_lock); return ret; } #define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { u64 reg; switch (BASER_INDEX(addr)) { case 0: reg = its->baser_device_table; break; case 1: reg = its->baser_coll_table; break; default: reg = 0; break; } return extract_bytes(reg, addr & 7, len); } #define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) static void vgic_mmio_write_its_baser(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 entry_size, table_type; u64 reg, *regptr, clearbits = 0; /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
*/ if (its->enabled) return; switch (BASER_INDEX(addr)) { case 0: regptr = &its->baser_device_table; entry_size = abi->dte_esz; table_type = GITS_BASER_TYPE_DEVICE; break; case 1: regptr = &its->baser_coll_table; entry_size = abi->cte_esz; table_type = GITS_BASER_TYPE_COLLECTION; clearbits = GITS_BASER_INDIRECT; break; default: return; } reg = update_64bit_reg(*regptr, addr & 7, len, val); reg &= ~GITS_BASER_RO_MASK; reg &= ~clearbits; reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; reg |= table_type << GITS_BASER_TYPE_SHIFT; reg = vgic_sanitise_its_baser(reg); *regptr = reg; if (!(reg & GITS_BASER_VALID)) { /* Take the its_lock to prevent a race with a save/restore */ mutex_lock(&its->its_lock); switch (table_type) { case GITS_BASER_TYPE_DEVICE: vgic_its_free_device_list(kvm, its); break; case GITS_BASER_TYPE_COLLECTION: vgic_its_free_collection_list(kvm, its); break; } mutex_unlock(&its->its_lock); } } static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, struct vgic_its *its, gpa_t addr, unsigned int len) { u32 reg = 0; mutex_lock(&its->cmd_lock); if (its->creadr == its->cwriter) reg |= GITS_CTLR_QUIESCENT; if (its->enabled) reg |= GITS_CTLR_ENABLE; mutex_unlock(&its->cmd_lock); return reg; } static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { mutex_lock(&its->cmd_lock); /* * It is UNPREDICTABLE to enable the ITS if any of the CBASER or * device/collection BASER are invalid */ if (!its->enabled && (val & GITS_CTLR_ENABLE) && (!(its->baser_device_table & GITS_BASER_VALID) || !(its->baser_coll_table & GITS_BASER_VALID) || !(its->cbaser & GITS_CBASER_VALID))) goto out; its->enabled = !!(val & GITS_CTLR_ENABLE); if (!its->enabled) vgic_its_invalidate_cache(its); /* * Try to process any pending commands. This function bails out early * if the ITS is disabled or no commands have been queued. 
*/ vgic_its_process_commands(kvm, its); out: mutex_unlock(&its->cmd_lock); } #define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ { \ .reg_offset = off, \ .len = length, \ .access_flags = acc, \ .its_read = rd, \ .its_write = wr, \ } #define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ { \ .reg_offset = off, \ .len = length, \ .access_flags = acc, \ .its_read = rd, \ .its_write = wr, \ .uaccess_its_write = uwr, \ } static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { /* Ignore */ } static struct vgic_register_region its_registers[] = { REGISTER_ITS_DESC(GITS_CTLR, vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC_UACCESS(GITS_IIDR, vgic_mmio_read_its_iidr, its_mmio_write_wi, vgic_mmio_uaccess_write_its_iidr, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_TYPER, vgic_mmio_read_its_typer, its_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_CBASER, vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_CWRITER, vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC_UACCESS(GITS_CREADR, vgic_mmio_read_its_creadr, its_mmio_write_wi, vgic_mmio_uaccess_write_its_creadr, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_BASER, vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_IDREGS_BASE, vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, VGIC_ACCESS_32bit), }; /* This is called on setting the LPI enable bit in the redistributor. */ void vgic_enable_lpis(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) its_sync_lpi_pending_table(vcpu); } static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, u64 addr) { struct vgic_io_device *iodev = &its->iodev; int ret; mutex_lock(&kvm->slots_lock); if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { ret = -EBUSY; goto out; } its->vgic_its_base = addr; iodev->regions = its_registers; iodev->nr_regions = ARRAY_SIZE(its_registers); kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); iodev->base_addr = its->vgic_its_base; iodev->iodev_type = IODEV_ITS; iodev->its = its; ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, KVM_VGIC_V3_ITS_SIZE, &iodev->dev); out: mutex_unlock(&kvm->slots_lock); return ret; } #define INITIAL_BASER_VALUE \ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ GITS_BASER_PAGE_SIZE_64K) #define INITIAL_PROPBASER_VALUE \ (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) static int vgic_its_create(struct kvm_device *dev, u32 type) { int ret; struct vgic_its *its; if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) return -ENODEV; its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT); if (!its) return -ENOMEM; mutex_lock(&dev->kvm->arch.config_lock); if (vgic_initialized(dev->kvm)) { ret = vgic_v4_init(dev->kvm); if (ret < 0) { mutex_unlock(&dev->kvm->arch.config_lock); kfree(its); return ret; } } mutex_init(&its->its_lock); mutex_init(&its->cmd_lock); /* Yep, even more trickery for lock ordering... 
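 * Taking cmd_lock and then its_lock once here, and dropping them in the
 * reverse order, records the intended cmd_lock -> its_lock nesting with
 * lockdep, so any later acquisition in the opposite order gets flagged.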
*/ #ifdef CONFIG_LOCKDEP mutex_lock(&its->cmd_lock); mutex_lock(&its->its_lock); mutex_unlock(&its->its_lock); mutex_unlock(&its->cmd_lock); #endif its->vgic_its_base = VGIC_ADDR_UNDEF; INIT_LIST_HEAD(&its->device_list); INIT_LIST_HEAD(&its->collection_list); xa_init(&its->translation_cache); dev->kvm->arch.vgic.msis_require_devid = true; dev->kvm->arch.vgic.has_its = true; its->enabled = false; its->dev = dev; its->baser_device_table = INITIAL_BASER_VALUE | ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); its->baser_coll_table = INITIAL_BASER_VALUE | ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; dev->private = its; ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1); mutex_unlock(&dev->kvm->arch.config_lock); return ret; } static void vgic_its_destroy(struct kvm_device *kvm_dev) { struct kvm *kvm = kvm_dev->kvm; struct vgic_its *its = kvm_dev->private; mutex_lock(&its->its_lock); vgic_its_free_device_list(kvm, its); vgic_its_free_collection_list(kvm, its); vgic_its_invalidate_cache(its); xa_destroy(&its->translation_cache); mutex_unlock(&its->its_lock); kfree(its); kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */ } static int vgic_its_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; gpa_t offset = attr->attr; int align; align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7; if (offset & align) return -EINVAL; region = vgic_find_mmio_region(its_registers, ARRAY_SIZE(its_registers), offset); if (!region) return -ENXIO; return 0; } static int vgic_its_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, u64 *reg, bool is_write) { const struct vgic_register_region *region; struct vgic_its *its; gpa_t addr, offset; unsigned int len; int align, ret = 0; its = dev->private; offset = attr->attr; /* * Although the spec supports upper/lower 32-bit accesses to * 64-bit ITS registers, the userspace ABI requires 64-bit * accesses to all 64-bit wide registers. We therefore only * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID * registers */ if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) align = 0x3; else align = 0x7; if (offset & align) return -EINVAL; mutex_lock(&dev->kvm->lock); if (!lock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { ret = -ENXIO; goto out; } region = vgic_find_mmio_region(its_registers, ARRAY_SIZE(its_registers), offset); if (!region) { ret = -ENXIO; goto out; } addr = its->vgic_its_base + offset; len = region->access_flags & VGIC_ACCESS_64bit ? 
8 : 4; if (is_write) { if (region->uaccess_its_write) ret = region->uaccess_its_write(dev->kvm, its, addr, len, *reg); else region->its_write(dev->kvm, its, addr, len, *reg); } else { *reg = region->its_read(dev->kvm, its, addr, len); } out: mutex_unlock(&dev->kvm->arch.config_lock); unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return ret; } static u32 compute_next_devid_offset(struct list_head *h, struct its_device *dev) { struct its_device *next; u32 next_offset; if (list_is_last(&dev->dev_list, h)) return 0; next = list_next_entry(dev, dev_list); next_offset = next->device_id - dev->device_id; return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); } static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) { struct its_ite *next; u32 next_offset; if (list_is_last(&ite->ite_list, h)) return 0; next = list_next_entry(ite, ite_list); next_offset = next->event_id - ite->event_id; return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); } /** * typedef entry_fn_t - Callback called on a table entry restore path * @its: its handle * @id: id of the entry * @entry: pointer to the entry * @opaque: pointer to an opaque data * * Return: < 0 on error, 0 if last element was identified, id offset to next * element otherwise */ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, void *opaque); /** * scan_its_table - Scan a contiguous table in guest RAM and applies a function * to each entry * * @its: its handle * @base: base gpa of the table * @size: size of the table in bytes * @esz: entry size in bytes * @start_id: the ID of the first entry in the table * (non zero for 2d level tables) * @fn: function to apply on each entry * @opaque: pointer to opaque data * * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) */ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, int start_id, entry_fn_t fn, void *opaque) { struct kvm *kvm = its->dev->kvm; unsigned long len = size; int id = start_id; gpa_t gpa = base; char entry[ESZ_MAX]; int ret; memset(entry, 0, esz); while (true) { int next_offset; size_t byte_offset; ret = kvm_read_guest_lock(kvm, gpa, entry, esz); if (ret) return ret; next_offset = fn(its, id, entry, opaque); if (next_offset <= 0) return next_offset; byte_offset = next_offset * esz; if (byte_offset >= len) break; id += next_offset; gpa += byte_offset; len -= byte_offset; } return 1; } /* * vgic_its_save_ite - Save an interrupt translation entry at @gpa */ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, struct its_ite *ite, gpa_t gpa) { u32 next_offset; u64 val; next_offset = compute_next_eventid_offset(&dev->itt_head, ite); val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); return vgic_its_write_entry_lock(its, gpa, val, ite); } /** * vgic_its_restore_ite - restore an interrupt translation entry * * @its: its handle * @event_id: id used for indexing * @ptr: pointer to the ITE entry * @opaque: pointer to the its_device */ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, void *ptr, void *opaque) { struct its_device *dev = opaque; struct its_collection *collection; struct kvm *kvm = its->dev->kvm; struct kvm_vcpu *vcpu = NULL; u64 val; u64 *p = (u64 *)ptr; struct vgic_irq *irq; u32 coll_id, lpi_id; struct its_ite *ite; u32 offset; val = *p; val = le64_to_cpu(val); coll_id = val 
& KVM_ITS_ITE_ICID_MASK; lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; if (!lpi_id) return 1; /* invalid entry, no choice but to scan next entry */ if (lpi_id < VGIC_MIN_LPI) return -EINVAL; offset = val >> KVM_ITS_ITE_NEXT_SHIFT; if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) return -EINVAL; collection = find_collection(its, coll_id); if (!collection) return -EINVAL; if (!vgic_its_check_event_id(its, dev, event_id)) return -EINVAL; ite = vgic_its_alloc_ite(dev, collection, event_id); if (IS_ERR(ite)) return PTR_ERR(ite); if (its_is_collection_mapped(collection)) vcpu = kvm_get_vcpu_by_id(kvm, collection->target_addr); irq = vgic_add_lpi(kvm, lpi_id, vcpu); if (IS_ERR(irq)) { its_free_ite(kvm, ite); return PTR_ERR(irq); } ite->irq = irq; return offset; } static int vgic_its_ite_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct its_ite *itea = container_of(a, struct its_ite, ite_list); struct its_ite *iteb = container_of(b, struct its_ite, ite_list); if (itea->event_id < iteb->event_id) return -1; else return 1; } static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); gpa_t base = device->itt_addr; struct its_ite *ite; int ret; int ite_esz = abi->ite_esz; list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); list_for_each_entry(ite, &device->itt_head, ite_list) { gpa_t gpa = base + ite->event_id * ite_esz; /* * If an LPI carries the HW bit, this means that this * interrupt is controlled by GICv4, and we do not * have direct access to that state without GICv4.1. * Let's simply fail the save operation... */ if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) return -EACCES; ret = vgic_its_save_ite(its, device, ite, gpa); if (ret) return ret; } return 0; } /** * vgic_its_restore_itt - restore the ITT of a device * * @its: its handle * @dev: device handle * * Return 0 on success, < 0 on error */ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); gpa_t base = dev->itt_addr; int ret; int ite_esz = abi->ite_esz; size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; ret = scan_its_table(its, base, max_size, ite_esz, 0, vgic_its_restore_ite, dev); /* scan_its_table returns +1 if all ITEs are invalid */ if (ret > 0) ret = 0; return ret; } /** * vgic_its_save_dte - Save a device table entry at a given GPA * * @its: ITS handle * @dev: ITS device * @ptr: GPA */ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr) { u64 val, itt_addr_field; u32 next_offset; itt_addr_field = dev->itt_addr >> 8; next_offset = compute_next_devid_offset(&its->device_list, dev); val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); return vgic_its_write_entry_lock(its, ptr, val, dte); } /** * vgic_its_restore_dte - restore a device table entry * * @its: its handle * @id: device id the DTE corresponds to * @ptr: kernel VA where the 8 byte DTE is located * @opaque: unused * * Return: < 0 on error, 0 if the dte is the last one, id offset to the * next dte otherwise */ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, void *ptr, void *opaque) { struct its_device *dev; u64 baser = its->baser_device_table; gpa_t itt_addr; u8 num_eventid_bits; u64 entry = *(u64 *)ptr; bool valid; u32 offset; int ret; entry = 
le64_to_cpu(entry); valid = entry >> KVM_ITS_DTE_VALID_SHIFT; num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; if (!valid) return 1; /* dte entry is valid */ offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; if (!vgic_its_check_id(its, baser, id, NULL)) return -EINVAL; dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); if (IS_ERR(dev)) return PTR_ERR(dev); ret = vgic_its_restore_itt(its, dev); if (ret) { vgic_its_free_device(its->dev->kvm, its, dev); return ret; } return offset; } static int vgic_its_device_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct its_device *deva = container_of(a, struct its_device, dev_list); struct its_device *devb = container_of(b, struct its_device, dev_list); if (deva->device_id < devb->device_id) return -1; else return 1; } /* * vgic_its_save_device_tables - Save the device table and all ITT * into guest RAM * * L1/L2 handling is hidden by vgic_its_check_id() helper which directly * returns the GPA of the device entry */ static int vgic_its_save_device_tables(struct vgic_its *its) { u64 baser = its->baser_device_table; struct its_device *dev; if (!(baser & GITS_BASER_VALID)) return 0; list_sort(NULL, &its->device_list, vgic_its_device_cmp); list_for_each_entry(dev, &its->device_list, dev_list) { int ret; gpa_t eaddr; if (!vgic_its_check_id(its, baser, dev->device_id, &eaddr)) return -EINVAL; ret = vgic_its_save_itt(its, dev); if (ret) return ret; ret = vgic_its_save_dte(its, dev, eaddr); if (ret) return ret; } return 0; } /** * handle_l1_dte - callback used for L1 device table entries (2 stage case) * * @its: its handle * @id: index of the entry in the L1 table * @addr: kernel VA * @opaque: unused * * L1 table entries are scanned by steps of 1 entry * Return < 0 if error, 0 if last dte was found when scanning the L2 * table, +1 otherwise (meaning next L1 entry must be scanned) */ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, void *opaque) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); int l2_start_id = id * (SZ_64K / abi->dte_esz); u64 entry = *(u64 *)addr; int dte_esz = abi->dte_esz; gpa_t gpa; int ret; entry = le64_to_cpu(entry); if (!(entry & KVM_ITS_L1E_VALID_MASK)) return 1; gpa = entry & KVM_ITS_L1E_ADDR_MASK; ret = scan_its_table(its, gpa, SZ_64K, dte_esz, l2_start_id, vgic_its_restore_dte, NULL); return ret; } /* * vgic_its_restore_device_tables - Restore the device table and all ITT * from guest RAM to internal data structs */ static int vgic_its_restore_device_tables(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_device_table; int l1_esz, ret; int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; gpa_t l1_gpa; if (!(baser & GITS_BASER_VALID)) return 0; l1_gpa = GITS_BASER_ADDR_48_to_52(baser); if (baser & GITS_BASER_INDIRECT) { l1_esz = GITS_LVL1_ENTRY_SIZE; ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, handle_l1_dte, NULL); } else { l1_esz = abi->dte_esz; ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, vgic_its_restore_dte, NULL); } /* scan_its_table returns +1 if all entries are invalid */ if (ret > 0) ret = 0; if (ret < 0) vgic_its_free_device_list(its->dev->kvm, its); return ret; } static int vgic_its_save_cte(struct vgic_its *its, struct its_collection *collection, gpa_t gpa) { u64 val; val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) 
| collection->collection_id); val = cpu_to_le64(val); return vgic_its_write_entry_lock(its, gpa, val, cte); } /* * Restore a collection entry into the ITS collection table. * Return +1 on success, 0 if the entry was invalid (which should be * interpreted as end-of-table), and a negative error value for generic errors. */ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa) { struct its_collection *collection; struct kvm *kvm = its->dev->kvm; u32 target_addr, coll_id; u64 val; int ret; ret = vgic_its_read_entry_lock(its, gpa, &val, cte); if (ret) return ret; val = le64_to_cpu(val); if (!(val & KVM_ITS_CTE_VALID_MASK)) return 0; target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); coll_id = val & KVM_ITS_CTE_ICID_MASK; if (target_addr != COLLECTION_NOT_MAPPED && !kvm_get_vcpu_by_id(kvm, target_addr)) return -EINVAL; collection = find_collection(its, coll_id); if (collection) return -EEXIST; if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) return -EINVAL; ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; collection->target_addr = target_addr; return 1; } /* * vgic_its_save_collection_table - Save the collection table into * guest RAM */ static int vgic_its_save_collection_table(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; size_t max_size, filled = 0; int ret, cte_esz = abi->cte_esz; if (!(baser & GITS_BASER_VALID)) return 0; max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; list_for_each_entry(collection, &its->collection_list, coll_list) { ret = vgic_its_save_cte(its, collection, gpa); if (ret) return ret; gpa += cte_esz; filled += cte_esz; } if (filled == max_size) return 0; /* * table is not fully filled, add a last dummy element * with valid bit unset */ return vgic_its_write_entry_lock(its, gpa, 0ULL, cte); } /* * vgic_its_restore_collection_table - reads the collection table * in guest memory and restores the ITS internal state. Requires the * BASER registers to be restored before. 
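 * The scan stops at the end of the table or at the first entry whose
 * valid bit is clear; the save path above writes such a dummy entry as an
 * end-of-table marker whenever the table is not completely filled.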
*/ static int vgic_its_restore_collection_table(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_coll_table; int cte_esz = abi->cte_esz; size_t max_size, read = 0; gpa_t gpa; int ret; if (!(baser & GITS_BASER_VALID)) return 0; gpa = GITS_BASER_ADDR_48_to_52(baser); max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; while (read < max_size) { ret = vgic_its_restore_cte(its, gpa); if (ret <= 0) break; gpa += cte_esz; read += cte_esz; } if (ret > 0) return 0; if (ret < 0) vgic_its_free_collection_list(its->dev->kvm, its); return ret; } /* * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM * according to v0 ABI */ static int vgic_its_save_tables_v0(struct vgic_its *its) { int ret; ret = vgic_its_save_device_tables(its); if (ret) return ret; return vgic_its_save_collection_table(its); } /* * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM * to internal data structs according to V0 ABI * */ static int vgic_its_restore_tables_v0(struct vgic_its *its) { int ret; ret = vgic_its_restore_collection_table(its); if (ret) return ret; ret = vgic_its_restore_device_tables(its); if (ret) vgic_its_free_collection_list(its->dev->kvm, its); return ret; } static int vgic_its_commit_v0(struct vgic_its *its) { const struct vgic_its_abi *abi; abi = vgic_its_get_abi(its); its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) << GITS_BASER_ENTRY_SIZE_SHIFT); its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) << GITS_BASER_ENTRY_SIZE_SHIFT); return 0; } static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) { /* We need to keep the ABI specific field values */ its->baser_coll_table &= ~GITS_BASER_VALID; its->baser_device_table &= ~GITS_BASER_VALID; its->cbaser = 0; its->creadr = 0; its->cwriter = 0; its->enabled = 0; vgic_its_free_device_list(kvm, its); vgic_its_free_collection_list(kvm, its); } static int vgic_its_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { case KVM_VGIC_ITS_ADDR_TYPE: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; case KVM_DEV_ARM_ITS_CTRL_RESET: return 0; case KVM_DEV_ARM_ITS_SAVE_TABLES: return 0; case KVM_DEV_ARM_ITS_RESTORE_TABLES: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: return vgic_its_has_attr_regs(dev, attr); } return -ENXIO; } static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ return 0; mutex_lock(&kvm->lock); if (!lock_all_vcpus(kvm)) { mutex_unlock(&kvm->lock); return -EBUSY; } mutex_lock(&kvm->arch.config_lock); mutex_lock(&its->its_lock); switch (attr) { case KVM_DEV_ARM_ITS_CTRL_RESET: vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: ret = abi->save_tables(its); break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); break; } mutex_unlock(&its->its_lock); mutex_unlock(&kvm->arch.config_lock); unlock_all_vcpus(kvm); mutex_unlock(&kvm->lock); return ret; } /* * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory * without the running VCPU when dirty ring is enabled. * * The running VCPU is required to track dirty guest pages when dirty ring * is enabled. 
Otherwise, the backup bitmap should be used to track the * dirty guest pages. When vgic/its tables are being saved, the backup * bitmap is used to track the dirty guest pages due to the missed running * VCPU in the period. */ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { struct vgic_its *its = dev->private; int ret; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: { u64 __user *uaddr = (u64 __user *)(long)attr->addr; unsigned long type = (unsigned long)attr->attr; u64 addr; if (type != KVM_VGIC_ITS_ADDR_TYPE) return -ENODEV; if (copy_from_user(&addr, uaddr, sizeof(addr))) return -EFAULT; ret = vgic_check_iorange(dev->kvm, its->vgic_its_base, addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE); if (ret) return ret; return vgic_register_its_iodev(dev->kvm, its, addr); } case KVM_DEV_ARM_VGIC_GRP_CTRL: return vgic_its_ctrl(dev->kvm, its, attr->attr); case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { u64 __user *uaddr = (u64 __user *)(long)attr->addr; u64 reg; if (get_user(reg, uaddr)) return -EFAULT; return vgic_its_attr_regs_access(dev, attr, &reg, true); } } return -ENXIO; } static int vgic_its_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: { struct vgic_its *its = dev->private; u64 addr = its->vgic_its_base; u64 __user *uaddr = (u64 __user *)(long)attr->addr; unsigned long type = (unsigned long)attr->attr; if (type != KVM_VGIC_ITS_ADDR_TYPE) return -ENODEV; if (copy_to_user(uaddr, &addr, sizeof(addr))) return -EFAULT; break; } case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { u64 __user *uaddr = (u64 __user *)(long)attr->addr; u64 reg; int ret; ret = vgic_its_attr_regs_access(dev, attr, &reg, false); if (ret) return ret; return put_user(reg, uaddr); } default: return -ENXIO; } return 0; } static struct kvm_device_ops kvm_arm_vgic_its_ops = { .name = "kvm-arm-vgic-its", .create = vgic_its_create, .destroy = vgic_its_destroy, .set_attr = vgic_its_set_attr, .get_attr = vgic_its_get_attr, .has_attr = vgic_its_has_attr, }; int kvm_vgic_register_its_device(void) { return kvm_register_device_ops(&kvm_arm_vgic_its_ops, KVM_DEV_TYPE_ARM_VGIC_ITS); }
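The KVM_DEV_ARM_VGIC_GRP_CTRL attributes handled by vgic_its_ctrl() above (KVM_DEV_ARM_ITS_CTRL_RESET, KVM_DEV_ARM_ITS_SAVE_TABLES, KVM_DEV_ARM_ITS_RESTORE_TABLES) are driven from userspace through the generic KVM device-attribute ioctls. Below is a minimal, illustrative sketch of that call path; the helper name its_table_ctrl() is made up for the example, and it assumes an ITS device fd previously obtained with KVM_CREATE_DEVICE and type KVM_DEV_TYPE_ARM_VGIC_ITS, with the guest's vCPUs already stopped.

/* Illustrative userspace sketch; not part of the kernel sources above. */
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int its_table_ctrl(int its_fd, __u64 what)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = what,	/* e.g. KVM_DEV_ARM_ITS_SAVE_TABLES */
	};

	/* Probe first so old kernels without the save/restore ABI fail cleanly. */
	if (ioctl(its_fd, KVM_HAS_DEVICE_ATTR, &attr))
		return -ENOTSUP;

	/* vgic_its_ctrl() needs to lock all vCPUs, so this returns EBUSY if
	 * the guest is still running. */
	if (ioctl(its_fd, KVM_SET_DEVICE_ATTR, &attr))
		return -errno;

	return 0;
}

A VMM would typically call its_table_ctrl(its_fd, KVM_DEV_ARM_ITS_SAVE_TABLES) on the migration source once the guest is paused, and its_table_ctrl(its_fd, KVM_DEV_ARM_ITS_RESTORE_TABLES) on the destination after GITS_CBASER and the GITS_BASERn registers have been restored through the KVM_DEV_ARM_VGIC_GRP_ITS_REGS group, matching the ordering requirement noted above for the collection table restore.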
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define
DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? 
*/ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC port policer entry */ struct dsa_mall_policer_tc_entry { u32 burst; u64 rate_bytes_per_sec; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct dsa_mall_policer_tc_entry policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. 
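 * Each element is a refcounted struct dsa_mac_addr (declared further down
 * in this header), so an address requested through several user ports is
 * only programmed once on the shared CPU port.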
*/ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. */ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. 
*/ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. */ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), 
&(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? 
dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. 
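 * A common pattern is for the tagging driver to publish an ops structure
 * through ds->tagger_data from its ->connect() hook, which the switch
 * driver then fills in from ->connect_tag_protocol(); see the tagger_data
 * member of struct dsa_switch above.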
*/ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_ts_stats)(struct dsa_switch *ds, int port, struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void 
(*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int 
(*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, struct dsa_mall_policer_tc_entry *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct ifreq *ifr); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int 
sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct 
dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif
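The dsa_switch_ops table above is large, but only @get_tag_protocol is mandatory; everything else is optional. Below is a minimal sketch of how a driver might populate the ops table and hand the switch over to the DSA core with dsa_register_switch(). The foo_* names and the port count are hypothetical illustrations; only the dsa_* symbols come from the header above, plus standard kernel helpers.

/* Hypothetical driver glue -- illustration only, not part of dsa.h. */
#include <linux/device.h>
#include <linux/slab.h>
#include <net/dsa.h>

static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds,
						  int port,
						  enum dsa_tag_protocol mprot)
{
	/* The only mandatory op: report the tagging protocol in use. */
	return DSA_TAG_PROTO_NONE;
}

static int foo_setup(struct dsa_switch *ds)
{
	/* Optional switch-wide initialization. */
	return 0;
}

static const struct dsa_switch_ops foo_switch_ops = {
	.get_tag_protocol	= foo_get_tag_protocol,
	.setup			= foo_setup,
};

static int foo_probe(struct device *dev)
{
	struct dsa_switch *ds;

	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
	if (!ds)
		return -ENOMEM;

	ds->dev = dev;
	ds->num_ports = 4;	/* hypothetical port count */
	ds->ops = &foo_switch_ops;

	/* Hands the switch to the DSA core (declared in the header above). */
	return dsa_register_switch(ds);
}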
23 23 23 23 23 23 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/smpboot.h> #include <asm/processor.h> #include <linux/kasan.h> #include <trace/events/ipi.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); static DEFINE_PER_CPU(struct task_struct *, irq_workd); static void wake_irq_workd(void) { struct task_struct *tsk = __this_cpu_read(irq_workd); if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) wake_up_process(tsk); } #ifdef CONFIG_SMP static void irq_work_wake(struct irq_work *entry) { wake_irq_workd(); } static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = IRQ_WORK_INIT_HARD(irq_work_wake); #endif static int irq_workd_should_run(unsigned int cpu) { return !llist_empty(this_cpu_ptr(&lazy_list)); } /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { int oflags; oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. 
*/ if (oflags & IRQ_WORK_PENDING) return false; return true; } void __weak arch_irq_work_raise(void) { /* * Lame architectures will get the timer tick callback */ } static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { struct llist_head *list; bool rt_lazy_work = false; bool lazy_work = false; int work_flags; work_flags = atomic_read(&work->node.a_flags); if (work_flags & IRQ_WORK_LAZY) lazy_work = true; else if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(work_flags & IRQ_WORK_HARD_IRQ)) rt_lazy_work = true; if (lazy_work || rt_lazy_work) list = this_cpu_ptr(&lazy_list); else list = this_cpu_ptr(&raised_list); if (!llist_add(&work->node.llist, list)) return; /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); preempt_enable(); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ bool irq_work_queue_on(struct irq_work *work, int cpu) { #ifndef CONFIG_SMP return irq_work_queue(work); #else /* CONFIG_SMP: */ /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); /* * On PREEMPT_RT the items which are not marked as * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work * item is used on the remote CPU to wake the thread. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) goto out; work = &per_cpu(irq_work_wakeup, cpu); if (!irq_work_claim(work)) goto out; } __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) || arch_irq_work_has_interrupt()) if (llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); return true; } void irq_work_single(void *arg) { struct irq_work *work = arg; int flags; /* * Clear the PENDING bit, after this point the @work can be re-used. * The PENDING bit acts as a lock, and we own it, so we can clear it * without atomic ops. */ flags = atomic_read(&work->node.a_flags); flags &= ~IRQ_WORK_PENDING; atomic_set(&work->node.a_flags, flags); /* * See irq_work_claim(). */ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. 
*/ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads);
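For context, a minimal usage sketch of the API implemented above: a hypothetical consumer initializes a struct irq_work, queues it from NMI/hard-IRQ context, and synchronizes against it on teardown. Only the irq_work_*() functions and IRQ_WORK_INIT() are real kernel API; the foo_* names are illustrative.

/* Hypothetical consumer of the irq_work API -- illustration only. */
#include <linux/irq_work.h>
#include <linux/printk.h>

static void foo_irq_work_fn(struct irq_work *work)
{
	/*
	 * Runs later from hardirq context (raised_list) or, on PREEMPT_RT,
	 * from the irq_work/%u kthread for items not marked IRQ_WORK_HARD_IRQ.
	 */
	pr_info("work deferred out of NMI context\n");
}

static struct irq_work foo_work = IRQ_WORK_INIT(foo_irq_work_fn);

static void foo_nmi_handler(void)
{
	/*
	 * NMI-safe: claiming uses atomic_fetch_or() and queueing uses llist,
	 * so no locks are taken. Returns false if the entry is already pending.
	 */
	irq_work_queue(&foo_work);
}

static void foo_exit(void)
{
	/* Waits for a possibly in-flight callback; must be called with IRQs enabled. */
	irq_work_sync(&foo_work);
}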
2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 - Linaro and Columbia University * Author: Jintack Lim <jintack.lim@linaro.org> */ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_nested.h> #include "hyp/include/hyp/adjust_pc.h" #include "trace.h" enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, /* Traps that take effect in Host EL0, this is rare! */ BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { const enum vcpu_sysreg index; const enum trap_behaviour behaviour; const u64 value; const u64 mask; }; /* Coarse Grained Trap definitions */ enum cgt_group_id { /* Indicates no coarse trap control */ __RESERVED__, /* * The first batch of IDs denote coarse trapping that are used * on their own instead of being part of a combination of * trap controls. */ CGT_HCR_TID1, CGT_HCR_TID2, CGT_HCR_TID3, CGT_HCR_IMO, CGT_HCR_FMO, CGT_HCR_TIDCP, CGT_HCR_TACR, CGT_HCR_TSW, CGT_HCR_TPC, CGT_HCR_TPU, CGT_HCR_TTLB, CGT_HCR_TVM, CGT_HCR_TDZ, CGT_HCR_TRVM, CGT_HCR_TLOR, CGT_HCR_TERR, CGT_HCR_APK, CGT_HCR_NV, CGT_HCR_NV_nNV2, CGT_HCR_NV1_nNV2, CGT_HCR_AT, CGT_HCR_nFIEN, CGT_HCR_TID4, CGT_HCR_TICAB, CGT_HCR_TOCU, CGT_HCR_ENSCXT, CGT_HCR_TTLBIS, CGT_HCR_TTLBOS, CGT_MDCR_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TDE, CGT_MDCR_TDA, CGT_MDCR_TDOSA, CGT_MDCR_TDRA, CGT_MDCR_E2PB, CGT_MDCR_TPMS, CGT_MDCR_TTRF, CGT_MDCR_E2TB, CGT_MDCR_TDCC, CGT_CPTR_TAM, CGT_CPTR_TCPAC, CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, CGT_CNTHCTL_EL1TVT, CGT_CNTHCTL_EL1TVCT, CGT_ICH_HCR_TC, CGT_ICH_HCR_TALL0, CGT_ICH_HCR_TALL1, CGT_ICH_HCR_TDIR, /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. 
*/ __MULTIPLE_CONTROL_BITS__, CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, CGT_HCR_TVM_TRVM, CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TPU_TICAB, CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, CGT_ICH_HCR_TC_TDIR, /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. */ __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, CGT_CNTHCTL_EL1NVPCT, CGT_CNTHCTL_EL1NVVCT, CGT_CPTR_TTA, CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ }; static const struct trap_bits coarse_trap_bits[] = { [CGT_HCR_TID1] = { .index = HCR_EL2, .value = HCR_TID1, .mask = HCR_TID1, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_TID2] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, .value = HCR_TID3, .mask = HCR_TID3, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_IMO] = { .index = HCR_EL2, .value = HCR_IMO, .mask = HCR_IMO, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_FMO] = { .index = HCR_EL2, .value = HCR_FMO, .mask = HCR_FMO, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_TIDCP] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, .value = HCR_TVM, .mask = HCR_TVM, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_TDZ] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, .value = HCR_TRVM, .mask = HCR_TRVM, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_TLOR] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, .value = HCR_TOCU, .mask = HCR_TOCU, .behaviour = BEHAVE_FORWARD_RW, }, 
[CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, .behaviour = BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, .behaviour = BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TAM] = { .index = CPTR_EL2, .value = CPTR_EL2_TAM, .mask = CPTR_EL2_TAM, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TCPAC] = { .index = CPTR_EL2, .value = CPTR_EL2_TCPAC, .mask = CPTR_EL2_TCPAC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_EnFPM] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_EnFPM, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CNTHCTL_EL1TVT] = { .index = CNTHCTL_EL2, .value = CNTHCTL_EL1TVT, .mask = CNTHCTL_EL1TVT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CNTHCTL_EL1TVCT] = { .index = CNTHCTL_EL2, .value = CNTHCTL_EL1TVCT, .mask = CNTHCTL_EL1TVCT, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TC, .mask = ICH_HCR_TC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL0] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL0, .mask = ICH_HCR_TALL0, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL1] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL1, .mask = ICH_HCR_TALL1, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TDIR] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TDIR, .mask = ICH_HCR_TDIR, .behaviour = BEHAVE_FORWARD_RW, }, }; #define MCB(id, ...) 
\ [id - __MULTIPLE_CONTROL_BITS__] = \ (const enum cgt_group_id[]){ \ __VA_ARGS__, __RESERVED__ \ } static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); /* * Warning, maximum confusion ahead. * * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN * * Note the single letter difference? Yet, the bits have the same * function despite a different layout and a different name. * * We don't try to reconcile this mess. We just use the E2H=0 bits * to generate something that is in the E2H=1 format, and live with * it. You're welcome. */ static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; return val & ((CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN) << 10); } static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) { if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) { if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) { u64 val; val = __vcpu_sys_reg(vcpu, HCR_EL2); return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); } static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) { if (!is_nested_nv2_guest(vcpu) || !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) { if (!is_nested_nv2_guest(vcpu) || !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) val = translate_cptr_el2_to_cpacr_el1(val); if (val & CPACR_EL1_TTA) return BEHAVE_FORWARD_RW; return BEHAVE_HANDLE_LOCALLY; } static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) { u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); unsigned int idx; switch (sysreg) { case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): case SYS_PMEVCNTRn_EL0(0) ... 
SYS_PMEVCNTRn_EL0(30): idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); break; case SYS_PMXEVTYPER_EL0: case SYS_PMXEVCNTR_EL0: idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); break; default: /* Someone used this trap helper for something else... */ KVM_BUG_ON(1, vcpu->kvm); return BEHAVE_HANDLE_LOCALLY; } if (kvm_pmu_counter_is_hyp(vcpu, idx)) return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; return BEHAVE_HANDLE_LOCALLY; } #define CCC(id, fn) \ [id - __COMPLEX_CONDITIONS__] = fn static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), CCC(CGT_CPTR_TTA, check_cptr_tta), CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* * Bit assignment for the trap controls. We use a 64bit word with the * following layout for each trapped sysreg: * * [9:0] enum cgt_group_id (10 bits) * [13:10] enum fgt_group_id (4 bits) * [19:14] bit number in the FGT register (6 bits) * [20] trap polarity (1 bit) * [25:21] FG filter (5 bits) * [35:26] Main SysReg table index (10 bits) * [62:36] Unused (27 bits) * [63] RES0 - Must be zero, as lost on insertion in the xarray */ #define TC_CGT_BITS 10 #define TC_FGT_BITS 4 #define TC_FGF_BITS 5 #define TC_SRI_BITS 10 union trap_config { u64 val; struct { unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */ unsigned long fgt:TC_FGT_BITS; /* Fine Grained Trap id */ unsigned long bit:6; /* Bit number */ unsigned long pol:1; /* Polarity */ unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ unsigned long sri:TC_SRI_BITS; /* SysReg Index */ unsigned long unused:27; /* Unused, should be zero */ unsigned long mbz:1; /* Must Be Zero */ }; }; struct encoding_to_trap_config { const u32 encoding; const u32 end; const union trap_config tc; const unsigned int line; }; #define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ { \ .encoding = sr_start, \ .end = sr_end, \ .tc = { \ .cgt = trap_id, \ }, \ .line = __LINE__, \ } #define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id) /* * Map encoding to trap bits for exception reported with EC=0x18. * These must only be evaluated when running a nested hypervisor, but * that the current context is not a hypervisor context. When the * trapped access matches one of the trap controls, the exception is * re-injected in the nested hypervisor. 
*/ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_REVIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_AIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_SMIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_CTR_EL0, CGT_HCR_TID2), SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0), sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0), sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0), sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0), sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0), sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0), sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0), sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0), sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0), sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0), sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0), sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0), sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0), sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0), sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP), SR_TRAP(SYS_ACTLR_EL1, CGT_HCR_TACR), SR_TRAP(SYS_DC_ISW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CISW, CGT_HCR_TSW), SR_TRAP(SYS_DC_IGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_IGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVADP, CGT_HCR_TPC), SR_TRAP(SYS_DC_IVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CIGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CIGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_IGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_IGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVADP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVADP, CGT_HCR_TPC), SR_TRAP(SYS_IC_IVAU, CGT_HCR_TPU_TOCU), SR_TRAP(SYS_IC_IALLU, CGT_HCR_TPU_TOCU), SR_TRAP(SYS_IC_IALLUIS, CGT_HCR_TPU_TICAB), SR_TRAP(SYS_DC_CVAU, CGT_HCR_TPU_TOCU), SR_TRAP(OP_TLBI_RVAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VMALLE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_ASIDE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAALE1NXS, 
CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VMALLE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_ASIDE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_ASIDE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_ASIDE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_ASIDE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VMALLE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_ASIDE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_ESR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_FAR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AFSR0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AFSR1_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), SR_TRAP(SYS_LORSA_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LOREA_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORN_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORC_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORID_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_ERRIDR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERRSELR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXADDR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXCTLR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXFR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC0_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC1_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC2_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC3_EL1, 
CGT_HCR_TERR), SR_TRAP(SYS_ERXSTATUS_EL1, CGT_HCR_TERR), SR_TRAP(SYS_APIAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIAKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIBKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIBKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDAKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDBKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDBKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK), /* All _EL2 registers */ SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_HCR_EL2, SYS_HCRX_EL2, CGT_HCR_NV), SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV), SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_TTBR0_EL2, SYS_TCR2_EL2, CGT_HCR_NV), SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_HDFGRTR_EL2, SYS_HAFGRTR_EL2, CGT_HCR_NV), /* Skip the SP_EL1 encoding... */ SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV), /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */ SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV), SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV), SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV), SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV), SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV), SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV), SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_MPAMVPM0_EL2, SYS_MPAMVPM7_EL2, CGT_HCR_NV), /* * Note that the spec. describes a group of MEC registers * whose access should not trap, therefore skip the following: * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2, * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2, * VMECID_P_EL2. 
*/ SR_RANGE_TRAP(SYS_VBAR_EL2, SYS_RMR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV), /* ICH_AP0R<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2, SYS_ICH_AP0R3_EL2, CGT_HCR_NV), /* ICH_AP1R<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2, SYS_ICH_AP1R3_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_ICH_HCR_EL2, SYS_ICH_EISR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV), /* ICH_LR<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_LR0_EL2, SYS_ICH_LR15_EL2, CGT_HCR_NV), SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV), /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2 */ SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0), SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV), /* CNT*_EL2 */ SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV), SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV), SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2, SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1OS, 
CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_CPP_RCTX, CGT_HCR_NV), SR_TRAP(OP_DVP_RCTX, CGT_HCR_NV), SR_TRAP(OP_CFP_RCTX, CGT_HCR_NV), SR_TRAP(SYS_SP_EL1, CGT_HCR_NV_nNV2), SR_TRAP(SYS_VBAR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_ELR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_SPSR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_SCXTNUM_EL1, CGT_HCR_NV1_nNV2_ENSCXT), SR_TRAP(SYS_SCXTNUM_EL0, CGT_HCR_ENSCXT), SR_TRAP(OP_AT_S1E1R, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E0R, CGT_HCR_AT), SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_PMCR_EL0, CGT_MDCR_TPM_TPMCR), SR_TRAP(SYS_PMCNTENSET_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCNTENCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMOVSSET_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(26), 
CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_OSDTRRX_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_OSDTRTX_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_DBGDTR_EL0, CGT_MDCR_TDCC_TDE_TDA), /* * Also covers DBGDTRRX_EL0, which has the same encoding as * SYS_DBGDTRTX_EL0... 
*/ SR_TRAP(SYS_DBGDTRTX_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDSCR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_OSECCR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_OSLAR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_OSLSR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_OSDLR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_DBGPRCR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_MDRAR_EL1, CGT_MDCR_TDE_TDRA), SR_TRAP(SYS_PMBLIMITR_EL1, CGT_MDCR_E2PB), 
SR_TRAP(SYS_PMBPTR_EL1, CGT_MDCR_E2PB), SR_TRAP(SYS_PMBSR_EL1, CGT_MDCR_E2PB), SR_TRAP(SYS_PMSCR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSEVFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSFCR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSICR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSIDR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF), SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBMAR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), /* op0=2, op1=1, and CRn<0b1000 */ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), /* * IMPDEF choice: * We treat 
ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for * ICC_SRE_EL1 access, and always handle it locally. */ SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); enum fg_filter_id { __NO_FGF__, HCRX_FGTnXS, /* Must be last */ __NR_FG_FILTER_IDS__ }; #define SR_FGF(sr, g, b, p, f) \ { \ .encoding = sr, \ .end = sr, \ .tc = { \ .fgt = g ## _GROUP, \ .bit = g ## _EL2_ ## b ## _SHIFT, \ .pol = p, \ .fgf = f, \ }, \ .line = __LINE__, \ } #define SR_FGT(sr, g, b, p) SR_FGF(sr, g, b, p, __NO_FGF__) static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ SR_FGT(SYS_AMAIR2_EL1, HFGxTR, nAMAIR2_EL1, 0), SR_FGT(SYS_MAIR2_EL1, HFGxTR, nMAIR2_EL1, 0), SR_FGT(SYS_S2POR_EL1, HFGxTR, nS2POR_EL1, 0), SR_FGT(SYS_POR_EL1, HFGxTR, nPOR_EL1, 0), SR_FGT(SYS_POR_EL0, HFGxTR, nPOR_EL0, 0), SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0), SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0), SR_FGT(SYS_RCWMASK_EL1, HFGxTR, nRCWMASK_EL1, 0), SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0), SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0), SR_FGT(SYS_GCSCR_EL1, HFGxTR, nGCS_EL1, 0), SR_FGT(SYS_GCSPR_EL1, HFGxTR, nGCS_EL1, 0), SR_FGT(SYS_GCSCRE0_EL1, HFGxTR, nGCS_EL0, 0), SR_FGT(SYS_GCSPR_EL0, HFGxTR, nGCS_EL0, 0), SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0), SR_FGT(SYS_ERXADDR_EL1, HFGxTR, ERXADDR_EL1, 1), SR_FGT(SYS_ERXPFGCDN_EL1, HFGxTR, ERXPFGCDN_EL1, 1), SR_FGT(SYS_ERXPFGCTL_EL1, HFGxTR, ERXPFGCTL_EL1, 1), SR_FGT(SYS_ERXPFGF_EL1, HFGxTR, ERXPFGF_EL1, 1), SR_FGT(SYS_ERXMISC0_EL1, HFGxTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC1_EL1, HFGxTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC2_EL1, HFGxTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC3_EL1, HFGxTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXSTATUS_EL1, HFGxTR, ERXSTATUS_EL1, 1), SR_FGT(SYS_ERXCTLR_EL1, HFGxTR, ERXCTLR_EL1, 1), SR_FGT(SYS_ERXFR_EL1, HFGxTR, ERXFR_EL1, 1), SR_FGT(SYS_ERRSELR_EL1, HFGxTR, ERRSELR_EL1, 1), SR_FGT(SYS_ERRIDR_EL1, HFGxTR, ERRIDR_EL1, 1), SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGxTR, ICC_IGRPENn_EL1, 1), SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGxTR, ICC_IGRPENn_EL1, 1), SR_FGT(SYS_VBAR_EL1, HFGxTR, VBAR_EL1, 1), SR_FGT(SYS_TTBR1_EL1, HFGxTR, TTBR1_EL1, 1), SR_FGT(SYS_TTBR0_EL1, HFGxTR, TTBR0_EL1, 1), SR_FGT(SYS_TPIDR_EL0, HFGxTR, TPIDR_EL0, 1), SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1), SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1), SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1), SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1), SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1), SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, 
SCXTNUM_EL1, 1), SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1), SR_FGT(SYS_REVIDR_EL1, HFGxTR, REVIDR_EL1, 1), SR_FGT(SYS_PAR_EL1, HFGxTR, PAR_EL1, 1), SR_FGT(SYS_MPIDR_EL1, HFGxTR, MPIDR_EL1, 1), SR_FGT(SYS_MIDR_EL1, HFGxTR, MIDR_EL1, 1), SR_FGT(SYS_MAIR_EL1, HFGxTR, MAIR_EL1, 1), SR_FGT(SYS_LORSA_EL1, HFGxTR, LORSA_EL1, 1), SR_FGT(SYS_LORN_EL1, HFGxTR, LORN_EL1, 1), SR_FGT(SYS_LORID_EL1, HFGxTR, LORID_EL1, 1), SR_FGT(SYS_LOREA_EL1, HFGxTR, LOREA_EL1, 1), SR_FGT(SYS_LORC_EL1, HFGxTR, LORC_EL1, 1), SR_FGT(SYS_ISR_EL1, HFGxTR, ISR_EL1, 1), SR_FGT(SYS_FAR_EL1, HFGxTR, FAR_EL1, 1), SR_FGT(SYS_ESR_EL1, HFGxTR, ESR_EL1, 1), SR_FGT(SYS_DCZID_EL0, HFGxTR, DCZID_EL0, 1), SR_FGT(SYS_CTR_EL0, HFGxTR, CTR_EL0, 1), SR_FGT(SYS_CSSELR_EL1, HFGxTR, CSSELR_EL1, 1), SR_FGT(SYS_CPACR_EL1, HFGxTR, CPACR_EL1, 1), SR_FGT(SYS_CONTEXTIDR_EL1, HFGxTR, CONTEXTIDR_EL1, 1), SR_FGT(SYS_CLIDR_EL1, HFGxTR, CLIDR_EL1, 1), SR_FGT(SYS_CCSIDR_EL1, HFGxTR, CCSIDR_EL1, 1), SR_FGT(SYS_APIBKEYLO_EL1, HFGxTR, APIBKey, 1), SR_FGT(SYS_APIBKEYHI_EL1, HFGxTR, APIBKey, 1), SR_FGT(SYS_APIAKEYLO_EL1, HFGxTR, APIAKey, 1), SR_FGT(SYS_APIAKEYHI_EL1, HFGxTR, APIAKey, 1), SR_FGT(SYS_APGAKEYLO_EL1, HFGxTR, APGAKey, 1), SR_FGT(SYS_APGAKEYHI_EL1, HFGxTR, APGAKey, 1), SR_FGT(SYS_APDBKEYLO_EL1, HFGxTR, APDBKey, 1), SR_FGT(SYS_APDBKEYHI_EL1, HFGxTR, APDBKey, 1), SR_FGT(SYS_APDAKEYLO_EL1, HFGxTR, APDAKey, 1), SR_FGT(SYS_APDAKEYHI_EL1, HFGxTR, APDAKey, 1), SR_FGT(SYS_AMAIR_EL1, HFGxTR, AMAIR_EL1, 1), SR_FGT(SYS_AIDR_EL1, HFGxTR, AIDR_EL1, 1), SR_FGT(SYS_AFSR1_EL1, HFGxTR, AFSR1_EL1, 1), SR_FGT(SYS_AFSR0_EL1, HFGxTR, AFSR0_EL1, 1), /* HFGITR_EL2 */ SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0), SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0), SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0), SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0), SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0), SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1), SR_FGT(SYS_DC_CGVAC, HFGITR, DCCVAC, 1), SR_FGT(SYS_DC_CGDVAC, HFGITR, DCCVAC, 1), SR_FGT(OP_CPP_RCTX, HFGITR, CPPRCTX, 1), SR_FGT(OP_DVP_RCTX, HFGITR, DVPRCTX, 1), SR_FGT(OP_CFP_RCTX, HFGITR, CFPRCTX, 1), SR_FGT(OP_TLBI_VAALE1, HFGITR, TLBIVAALE1, 1), SR_FGT(OP_TLBI_VALE1, HFGITR, TLBIVALE1, 1), SR_FGT(OP_TLBI_VAAE1, HFGITR, TLBIVAAE1, 1), SR_FGT(OP_TLBI_ASIDE1, HFGITR, TLBIASIDE1, 1), SR_FGT(OP_TLBI_VAE1, HFGITR, TLBIVAE1, 1), SR_FGT(OP_TLBI_VMALLE1, HFGITR, TLBIVMALLE1, 1), SR_FGT(OP_TLBI_RVAALE1, HFGITR, TLBIRVAALE1, 1), SR_FGT(OP_TLBI_RVALE1, HFGITR, TLBIRVALE1, 1), SR_FGT(OP_TLBI_RVAAE1, HFGITR, TLBIRVAAE1, 1), SR_FGT(OP_TLBI_RVAE1, HFGITR, TLBIRVAE1, 1), SR_FGT(OP_TLBI_RVAALE1IS, HFGITR, TLBIRVAALE1IS, 1), SR_FGT(OP_TLBI_RVALE1IS, HFGITR, TLBIRVALE1IS, 1), SR_FGT(OP_TLBI_RVAAE1IS, HFGITR, TLBIRVAAE1IS, 1), SR_FGT(OP_TLBI_RVAE1IS, HFGITR, TLBIRVAE1IS, 1), SR_FGT(OP_TLBI_VAALE1IS, HFGITR, TLBIVAALE1IS, 1), SR_FGT(OP_TLBI_VALE1IS, HFGITR, TLBIVALE1IS, 1), SR_FGT(OP_TLBI_VAAE1IS, HFGITR, TLBIVAAE1IS, 1), SR_FGT(OP_TLBI_ASIDE1IS, HFGITR, TLBIASIDE1IS, 1), SR_FGT(OP_TLBI_VAE1IS, HFGITR, TLBIVAE1IS, 1), SR_FGT(OP_TLBI_VMALLE1IS, HFGITR, TLBIVMALLE1IS, 1), SR_FGT(OP_TLBI_RVAALE1OS, HFGITR, TLBIRVAALE1OS, 1), SR_FGT(OP_TLBI_RVALE1OS, HFGITR, TLBIRVALE1OS, 1), SR_FGT(OP_TLBI_RVAAE1OS, HFGITR, TLBIRVAAE1OS, 1), SR_FGT(OP_TLBI_RVAE1OS, HFGITR, TLBIRVAE1OS, 1), SR_FGT(OP_TLBI_VAALE1OS, HFGITR, TLBIVAALE1OS, 1), SR_FGT(OP_TLBI_VALE1OS, HFGITR, TLBIVALE1OS, 1), SR_FGT(OP_TLBI_VAAE1OS, HFGITR, TLBIVAAE1OS, 1), SR_FGT(OP_TLBI_ASIDE1OS, HFGITR, TLBIASIDE1OS, 1), SR_FGT(OP_TLBI_VAE1OS, 
HFGITR, TLBIVAE1OS, 1), SR_FGT(OP_TLBI_VMALLE1OS, HFGITR, TLBIVMALLE1OS, 1), /* nXS variants must be checked against HCRX_EL2.FGTnXS */ SR_FGF(OP_TLBI_VAALE1NXS, HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1NXS, HFGITR, TLBIVALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1NXS, HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1NXS, HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1NXS, HFGITR, TLBIVAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1NXS, HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1NXS, HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1NXS, HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1NXS, HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1NXS, HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1ISNXS, HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1ISNXS, HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1ISNXS, HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1ISNXS, HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAALE1ISNXS, HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1ISNXS, HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1ISNXS, HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1ISNXS, HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1ISNXS, HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1ISNXS, HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1OSNXS, HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1OSNXS, HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1OSNXS, HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1OSNXS, HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAALE1OSNXS, HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1OSNXS, HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1OSNXS, HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1OSNXS, HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1OSNXS, HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1OSNXS, HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS), SR_FGT(OP_AT_S1E1WP, HFGITR, ATS1E1WP, 1), SR_FGT(OP_AT_S1E1RP, HFGITR, ATS1E1RP, 1), SR_FGT(OP_AT_S1E0W, HFGITR, ATS1E0W, 1), SR_FGT(OP_AT_S1E0R, HFGITR, ATS1E0R, 1), SR_FGT(OP_AT_S1E1W, HFGITR, ATS1E1W, 1), SR_FGT(OP_AT_S1E1R, HFGITR, ATS1E1R, 1), SR_FGT(SYS_DC_ZVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_GVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_GZVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_CIVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CIGVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CIGDVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CGVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CGDVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CGVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CGDVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CVAU, HFGITR, DCCVAU, 1), SR_FGT(SYS_DC_CISW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CIGSW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CIGDSW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_CGSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_CGDSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_ISW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IGSW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IGDSW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_DC_IGVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_DC_IGDVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1), SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1), SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1), /* HDFGRTR_EL2 */ SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1), 
SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0), SR_FGT(SYS_BRBINF_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(4), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINFINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(4), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRCINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(4), 
HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGTINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTS_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBCR_EL1, HDFGRTR, nBRBCTL, 0), SR_FGT(SYS_BRBFCR_EL1, HDFGRTR, nBRBCTL, 0), SR_FGT(SYS_BRBIDR0_EL1, HDFGRTR, nBRBIDR, 0), SR_FGT(SYS_PMCEID0_EL0, HDFGRTR, PMCEIDn_EL0, 1), SR_FGT(SYS_PMCEID1_EL0, HDFGRTR, PMCEIDn_EL0, 1), SR_FGT(SYS_PMUSERENR_EL0, HDFGRTR, PMUSERENR_EL0, 1), SR_FGT(SYS_TRBTRG_EL1, HDFGRTR, TRBTRG_EL1, 1), SR_FGT(SYS_TRBSR_EL1, HDFGRTR, TRBSR_EL1, 1), SR_FGT(SYS_TRBPTR_EL1, HDFGRTR, TRBPTR_EL1, 1), SR_FGT(SYS_TRBMAR_EL1, HDFGRTR, TRBMAR_EL1, 1), SR_FGT(SYS_TRBLIMITR_EL1, HDFGRTR, TRBLIMITR_EL1, 1), SR_FGT(SYS_TRBIDR_EL1, HDFGRTR, TRBIDR_EL1, 1), SR_FGT(SYS_TRBBASER_EL1, HDFGRTR, TRBBASER_EL1, 1), SR_FGT(SYS_TRCVICTLR, HDFGRTR, TRCVICTLR, 1), SR_FGT(SYS_TRCSTATR, HDFGRTR, TRCSTATR, 1), SR_FGT(SYS_TRCSSCSR(0), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(1), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(2), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(3), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(4), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(5), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(6), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(7), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSEQSTR, HDFGRTR, TRCSEQSTR, 1), SR_FGT(SYS_TRCPRGCTLR, HDFGRTR, TRCPRGCTLR, 1), SR_FGT(SYS_TRCOSLSR, HDFGRTR, TRCOSLSR, 1), SR_FGT(SYS_TRCIMSPEC(0), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(1), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(2), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(3), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(4), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(5), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(6), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(7), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCDEVARCH, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCDEVID, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR0, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR1, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR2, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR3, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR4, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR5, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR6, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR7, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR8, HDFGRTR, TRCID, 1), 
SR_FGT(SYS_TRCIDR9, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR10, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR11, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR12, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR13, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCCNTVR(0), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(1), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(2), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(3), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCLAIMCLR, HDFGRTR, TRCCLAIM, 1), SR_FGT(SYS_TRCCLAIMSET, HDFGRTR, TRCCLAIM, 1), SR_FGT(SYS_TRCAUXCTLR, HDFGRTR, TRCAUXCTLR, 1), SR_FGT(SYS_TRCAUTHSTATUS, HDFGRTR, TRCAUTHSTATUS, 1), SR_FGT(SYS_TRCACATR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(10), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(10), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCBBCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCCCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCCTLR0, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCCTLR1, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCONFIGR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEVENTCTL0R, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEVENTCTL1R, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCQCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(10), HDFGRTR, TRC, 1), 
SR_FGT(SYS_TRCRSCTLR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(16), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(17), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(18), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(19), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(20), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(21), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(22), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(23), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(24), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(25), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(26), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(27), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(28), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(29), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(30), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(31), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQRSTEVR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSTALLCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSYNCPR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCTRACEIDR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCTSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVIIECTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVIPCSSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVISSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCCTLR0, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCCTLR1, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_PMSLATFR_EL1, HDFGRTR, PMSLATFR_EL1, 1), SR_FGT(SYS_PMSIRR_EL1, HDFGRTR, PMSIRR_EL1, 1), SR_FGT(SYS_PMSIDR_EL1, HDFGRTR, PMSIDR_EL1, 1), SR_FGT(SYS_PMSICR_EL1, HDFGRTR, PMSICR_EL1, 1), SR_FGT(SYS_PMSFCR_EL1, HDFGRTR, PMSFCR_EL1, 1), SR_FGT(SYS_PMSEVFR_EL1, HDFGRTR, PMSEVFR_EL1, 1), SR_FGT(SYS_PMSCR_EL1, HDFGRTR, PMSCR_EL1, 1), SR_FGT(SYS_PMBSR_EL1, HDFGRTR, PMBSR_EL1, 1), SR_FGT(SYS_PMBPTR_EL1, HDFGRTR, PMBPTR_EL1, 1), SR_FGT(SYS_PMBLIMITR_EL1, HDFGRTR, PMBLIMITR_EL1, 1), SR_FGT(SYS_PMMIR_EL1, HDFGRTR, PMMIR_EL1, 1), SR_FGT(SYS_PMSELR_EL0, HDFGRTR, PMSELR_EL0, 1), SR_FGT(SYS_PMOVSCLR_EL0, HDFGRTR, PMOVS, 1), SR_FGT(SYS_PMOVSSET_EL0, HDFGRTR, PMOVS, 1), SR_FGT(SYS_PMINTENCLR_EL1, HDFGRTR, PMINTEN, 1), SR_FGT(SYS_PMINTENSET_EL1, HDFGRTR, PMINTEN, 1), SR_FGT(SYS_PMCNTENCLR_EL0, HDFGRTR, PMCNTEN, 1), SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1), SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1), SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1), 
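	/*
	 * Note: HDFGRTR_EL2 describes the PMU event registers with a single
	 * PMEVTYPERn_EL0 bit and a single PMEVCNTRn_EL0 bit, which is why
	 * every per-counter encoding below maps to the same trap bit.
	 */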
SR_FGT(SYS_PMEVTYPERn_EL0(0), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(1), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(2), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(3), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(4), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(5), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(6), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(7), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(8), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(9), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(10), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(11), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(12), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(13), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(14), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(15), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(16), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(17), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(18), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(19), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(20), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(21), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(22), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(23), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(24), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(25), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(26), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(27), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(28), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(29), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVTYPERn_EL0(30), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(0), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(1), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(2), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(3), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(4), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(5), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(6), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(7), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(8), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(9), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(10), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(11), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(12), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(13), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(14), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(15), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(16), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(17), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(18), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(19), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(20), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(21), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(22), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(23), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(24), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(25), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(26), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(27), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(28), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(29), 
HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_PMEVCNTRn_EL0(30), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1), SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1), SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1), SR_FGT(SYS_DBGPRCR_EL1, HDFGRTR, DBGPRCR_EL1, 1), SR_FGT(SYS_DBGAUTHSTATUS_EL1, HDFGRTR, DBGAUTHSTATUS_EL1, 1), SR_FGT(SYS_DBGCLAIMSET_EL1, HDFGRTR, DBGCLAIM, 1), SR_FGT(SYS_DBGCLAIMCLR_EL1, HDFGRTR, DBGCLAIM, 1), SR_FGT(SYS_MDSCR_EL1, HDFGRTR, MDSCR_EL1, 1), /* * The trap bits capture *64* debug registers per bit, but the * ARM ARM only describes the encoding for the first 16, and * we don't really support more than that anyway. */ SR_FGT(SYS_DBGWVRn_EL1(0), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(1), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(2), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(3), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(4), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(5), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(6), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(7), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(8), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(9), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(10), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(11), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(12), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(13), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(14), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(15), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(0), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(1), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(2), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(3), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(4), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(5), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(6), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(7), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(8), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(9), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(10), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(11), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(12), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(13), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(14), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(15), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(0), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(1), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(2), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(3), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(4), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(5), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(6), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(7), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(8), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(9), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(10), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(11), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(12), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(13), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(14), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(15), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(0), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(1), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(2), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(3), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(4), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(5), HDFGRTR, DBGBCRn_EL1, 1), 
SR_FGT(SYS_DBGBCRn_EL1(6), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(7), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(8), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(9), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(10), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(11), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(12), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1), /* * HDFGWTR_EL2 * * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely * overlap in their bit assignment, there are a number of bits * that are RES0 on one side, and an actual trap bit on the * other. The policy chosen here is to describe all the * read-side mappings, and only the write-side mappings that * differ from the read side, and the trap handler will pick * the correct shadow register based on the access type. */ SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1), SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1), SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), /* * HAFGRTR_EL2 */ SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(4), HAFGRTR, AMEVCNTR14_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1), SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1), SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1), SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1), SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, 
	       AMEVCNTR00_EL0, 1),
};

static union trap_config get_trap_config(u32 sysreg)
{
	return (union trap_config) {
		.val = xa_to_value(xa_load(&sr_forward_xa, sysreg)),
	};
}

static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc,
				       const char *type, int err)
{
	kvm_err("%s line %d encoding range "
		"(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n",
		type, tc->line,
		sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding),
		sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding),
		sys_reg_Op2(tc->encoding),
		sys_reg_Op0(tc->end), sys_reg_Op1(tc->end),
		sys_reg_CRn(tc->end), sys_reg_CRm(tc->end),
		sys_reg_Op2(tc->end),
		err);
}

static u32 encoding_next(u32 encoding)
{
	u8 op0, op1, crn, crm, op2;

	op0 = sys_reg_Op0(encoding);
	op1 = sys_reg_Op1(encoding);
	crn = sys_reg_CRn(encoding);
	crm = sys_reg_CRm(encoding);
	op2 = sys_reg_Op2(encoding);

	if (op2 < Op2_mask)
		return sys_reg(op0, op1, crn, crm, op2 + 1);
	if (crm < CRm_mask)
		return sys_reg(op0, op1, crn, crm + 1, 0);
	if (crn < CRn_mask)
		return sys_reg(op0, op1, crn + 1, 0, 0);
	if (op1 < Op1_mask)
		return sys_reg(op0, op1 + 1, 0, 0, 0);

	return sys_reg(op0 + 1, 0, 0, 0, 0);
}
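/*
 * Purely illustrative example of the walk above: encoding_next() steps
 * through the (Op0, Op1, CRn, CRm, Op2) space with Op2 varying fastest,
 * so a hypothetical range sys_reg(3, 0, 10, 2, 2) .. sys_reg(3, 0, 10, 3, 1)
 * covers (3,0,10,2,2) through (3,0,10,2,7), then (3,0,10,3,0) and
 * (3,0,10,3,1). populate_nv_trap_config() below relies on this to give
 * every encoding covered by an SR_RANGE_TRAP() entry its own xarray slot
 * pointing at the same trap configuration.
 */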
int __init populate_nv_trap_config(void)
{
	int ret = 0;

	BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *));
	BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));
	BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS));
	BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS));

	for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
		const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
		void *prev;

		if (cgt->tc.val & BIT(63)) {
			kvm_err("CGT[%d] has MBZ bit set\n", i);
			ret = -EINVAL;
		}

		for (u32 enc = cgt->encoding; enc <= cgt->end; enc = encoding_next(enc)) {
			prev = xa_store(&sr_forward_xa, enc,
					xa_mk_value(cgt->tc.val), GFP_KERNEL);
			if (prev && !xa_is_err(prev)) {
				ret = -EINVAL;
				print_nv_trap_error(cgt, "Duplicate CGT", ret);
			}

			if (xa_is_err(prev)) {
				ret = xa_err(prev);
				print_nv_trap_error(cgt, "Failed CGT insertion", ret);
			}
		}
	}

	kvm_info("nv: %ld coarse grained trap handlers\n",
		 ARRAY_SIZE(encoding_to_cgt));

	if (!cpus_have_final_cap(ARM64_HAS_FGT))
		goto check_mcb;

	for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) {
		const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i];
		union trap_config tc;
		void *prev;

		if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) {
			ret = -EINVAL;
			print_nv_trap_error(fgt, "Invalid FGT", ret);
		}

		tc = get_trap_config(fgt->encoding);

		if (tc.fgt) {
			ret = -EINVAL;
			print_nv_trap_error(fgt, "Duplicate FGT", ret);
		}

		tc.val |= fgt->tc.val;
		prev = xa_store(&sr_forward_xa, fgt->encoding,
				xa_mk_value(tc.val), GFP_KERNEL);

		if (xa_is_err(prev)) {
			ret = xa_err(prev);
			print_nv_trap_error(fgt, "Failed FGT insertion", ret);
		}
	}

	kvm_info("nv: %ld fine grained trap handlers\n",
		 ARRAY_SIZE(encoding_to_fgt));

check_mcb:
	for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) {
		const enum cgt_group_id *cgids;

		cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];

		for (int i = 0; cgids[i] != __RESERVED__; i++) {
			if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ &&
			    cgids[i] < __COMPLEX_CONDITIONS__) {
				kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
				ret = -EINVAL;
			}
		}
	}

	if (ret)
		xa_destroy(&sr_forward_xa);

	return ret;
}

int __init populate_sysreg_config(const struct sys_reg_desc *sr,
				  unsigned int idx)
{
	union trap_config tc;
	u32 encoding;
	void *ret;

	/*
	 * 0 is a valid value for the index, but not for the storage.
	 * We'll store (idx+1), so check against an offset'd limit.
	 */
	if (idx >= (BIT(TC_SRI_BITS) - 1)) {
		kvm_err("sysreg %s (%d) out of range\n", sr->name, idx);
		return -EINVAL;
	}

	encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2);
	tc = get_trap_config(encoding);

	if (tc.sri) {
		kvm_err("sysreg %s (%d) duplicate entry (%d)\n",
			sr->name, idx - 1, tc.sri);
		return -EINVAL;
	}

	tc.sri = idx + 1;
	ret = xa_store(&sr_forward_xa, encoding,
		       xa_mk_value(tc.val), GFP_KERNEL);

	return xa_err(ret);
}

static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu,
					 const struct trap_bits *tb)
{
	enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;
	u64 val;

	val = __vcpu_sys_reg(vcpu, tb->index);
	if ((val & tb->mask) == tb->value)
		b |= tb->behaviour;

	return b;
}

static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu,
						    const enum cgt_group_id id,
						    enum trap_behaviour b)
{
	switch (id) {
		const enum cgt_group_id *cgids;

	case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1:
		if (likely(id != __RESERVED__))
			b |= get_behaviour(vcpu, &coarse_trap_bits[id]);
		break;
	case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1:
		/* Yes, this is recursive. Don't do anything stupid. */
		cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
		for (int i = 0; cgids[i] != __RESERVED__; i++)
			b |= __compute_trap_behaviour(vcpu, cgids[i], b);
		break;
	default:
		if (ARRAY_SIZE(ccc))
			b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu);
		break;
	}

	return b;
}

static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu,
						  const union trap_config tc)
{
	enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;

	return __compute_trap_behaviour(vcpu, tc.cgt, b);
}

static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr)
{
	struct kvm_sysreg_masks *masks;

	/* Only handle the VNCR-backed regs for now */
	if (sr < __VNCR_START__)
		return 0;

	masks = kvm->arch.sysreg_masks;

	return masks->mask[sr - __VNCR_START__].res0;
}

static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read,
			  u64 val, const union trap_config tc)
{
	struct kvm *kvm = vcpu->kvm;
	enum vcpu_sysreg sr;

	/*
	 * KVM doesn't know about any FGTs that apply to the host, and hopefully
	 * that'll remain the case.
	 */
	if (is_hyp_ctxt(vcpu))
		return false;

	if (tc.pol)
		return (val & BIT(tc.bit));

	/*
	 * FGTs with negative polarities are an absolute nightmare, as
	 * we need to evaluate the bit in the light of the feature
	 * that defines it. WTF were they thinking?
	 *
	 * So let's check if the bit has been earmarked as RES0, as
	 * this indicates an unimplemented feature.
	 */
	if (val & BIT(tc.bit))
		return false;

	switch ((enum fgt_group_id)tc.fgt) {
	case HFGxTR_GROUP:
		sr = is_read ? HFGRTR_EL2 : HFGWTR_EL2;
		break;

	case HDFGRTR_GROUP:
		sr = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2;
		break;

	case HAFGRTR_GROUP:
		sr = HAFGRTR_EL2;
		break;

	case HFGITR_GROUP:
		sr = HFGITR_EL2;
		break;

	default:
		WARN_ONCE(1, "Unhandled FGT group");
		return false;
	}

	return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit));
}
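/*
 * A purely illustrative reading of the negative-polarity case above: for a
 * hypothetical nXYZ_EL1 bit, the guest hypervisor writing 1 means "don't
 * trap", so we return false. If the bit is 0, it either requests a trap or
 * is RES0 because the feature behind it isn't implemented for this VM; only
 * when the bit is not part of the VM's RES0 mask for that register do we
 * report a trap to be forwarded.
 */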
In that case, * inject an UNDEF. */ if (tc.fgt != __NO_FGT_GROUP__ && (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { kvm_inject_undefined(vcpu); return true; } /* * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ if (!vcpu_has_nv(vcpu)) goto local; /* * There are a few traps that take effect InHost, but are constrained * to EL0. Don't bother with computing the trap behaviour if the vCPU * isn't in EL0. */ if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { case __NO_FGT_GROUP__: break; case HFGxTR_GROUP: if (is_read) val = __vcpu_sys_reg(vcpu, HFGRTR_EL2); else val = __vcpu_sys_reg(vcpu, HFGWTR_EL2); break; case HDFGRTR_GROUP: if (is_read) val = __vcpu_sys_reg(vcpu, HDFGRTR_EL2); else val = __vcpu_sys_reg(vcpu, HDFGWTR_EL2); break; case HAFGRTR_GROUP: val = __vcpu_sys_reg(vcpu, HAFGRTR_EL2); break; case HFGITR_GROUP: val = __vcpu_sys_reg(vcpu, HFGITR_EL2); switch (tc.fgf) { u64 tmp; case __NO_FGF__: break; case HCRX_FGTnXS: tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); if (tmp & HCRX_EL2_FGTnXS) tc.fgt = __NO_FGT_GROUP__; } break; case __NR_FGT_GROUP_IDS__: /* Something is really wrong, bail out */ WARN_ONCE(1, "__NR_FGT_GROUP_IDS__"); goto local; } if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) goto local; if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; local: if (!tc.sri) { struct sys_reg_params params; params = esr_sys64_to_params(esr); /* * Check for the IMPDEF range, as per DDI0487 J.a, * D18.3.2 Reserved encodings for IMPLEMENTATION * DEFINED registers. */ if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) print_sys_reg_msg(&params, "Unsupported guest access at: %lx\n", *vcpu_pc(vcpu)); kvm_inject_undefined(vcpu); return true; } *sr_index = tc.sri - 1; return false; inject: trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read); kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; } static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) { bool control_bit_set; if (!vcpu_has_nv(vcpu)) return false; control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit; if (!is_hyp_ctxt(vcpu) && control_bit_set) { kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; } return false; } static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) { return __forward_traps(vcpu, HCR_EL2, control_bit); } bool forward_smc_trap(struct kvm_vcpu *vcpu) { return forward_hcr_traps(vcpu, HCR_TSC); } static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) { return __forward_traps(vcpu, MDCR_EL2, control_bit); } bool forward_debug_exception(struct kvm_vcpu *vcpu) { return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); } static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) { u64 mode = spsr & PSR_MODE_MASK; /* * Possible causes for an Illegal Exception Return from EL2: * - trying to return to EL3 * - trying to return to an illegal M value * - trying to return to a 32bit EL * - trying to return to EL1 with HCR_EL2.TGE set */ if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h || mode == 0b00001 || (mode & BIT(1)) || (spsr & PSR_MODE32_BIT) || (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t || mode == PSR_MODE_EL1h))) { /* * The guest is playing with our nerves. 
Preserve EL, SP, * masks, flags from the existing PSTATE, and set IL. * The HW will then generate an Illegal State Exception * immediately after ERET. */ spsr = *vcpu_cpsr(vcpu); spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_MODE_MASK | PSR_MODE32_BIT); spsr |= PSR_IL_BIT; } return spsr; } void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) { u64 spsr, elr, esr; /* * Forward this trap to the virtual EL2 if the virtual * HCR_EL2.NV bit is set and this is coming from !EL2. */ if (forward_hcr_traps(vcpu, HCR_NV)) return; spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); spsr = kvm_check_illegal_exception_return(vcpu, spsr); /* Check for an ERETAx */ esr = kvm_vcpu_get_esr(vcpu); if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { /* * Oh no, ERETAx failed to authenticate. * * If we have FPACCOMBINE and we don't have a pending * Illegal Execution State exception (which has priority * over FPAC), deliver an exception right away. * * Otherwise, let the mangled ELR value trickle down the * ERET handling, and the guest will have a little surprise. */ if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { esr &= ESR_ELx_ERET_ISS_ERETA; esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); kvm_inject_nested_sync(vcpu, esr); return; } } preempt_disable(); kvm_arch_vcpu_put(vcpu); if (!esr_iss_is_eretax(esr)) elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, enum exception_type type) { trace_kvm_inject_nested_exception(vcpu, esr_el2, type); switch (type) { case except_type_sync: kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2); break; case except_type_irq: kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ); break; default: WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type); } } /* * Emulate taking an exception to EL2. * See ARM ARM J8.1.2 AArch64.TakeException() */ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, enum exception_type type) { u64 pstate, mode; bool direct_inject; if (!vcpu_has_nv(vcpu)) { kvm_err("Unexpected call to %s for the non-nesting configuration\n", __func__); return -EINVAL; } /* * As for ERET, we can avoid doing too much on the injection path by * checking that we either took the exception from a VHE host * userspace or from vEL2. In these cases, there is no change in * translation regime (or anything else), so let's do as little as * possible. */ pstate = *vcpu_cpsr(vcpu); mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); direct_inject = (mode == PSR_MODE_EL0t && vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)); direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); if (direct_inject) { kvm_inject_el2_exception(vcpu, esr_el2, type); return 1; } preempt_disable(); /* * We may have an exception or PC update in the EL0/EL1 context. * Commit it before entering EL2. */ __kvm_adjust_pc(vcpu); kvm_arch_vcpu_put(vcpu); kvm_inject_el2_exception(vcpu, esr_el2, type); /* * A hard requirement is that a switch between EL1 and EL2 * contexts has to happen between a put/load, so that we can * pick the correct timer and interrupt configuration, among * other things. * * Make sure the exception actually took place before we load * the new context. 
*/ __kvm_adjust_pc(vcpu); kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); kvm_pmu_nested_transition(vcpu); return 1; } int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2) { return kvm_inject_nested(vcpu, esr_el2, except_type_sync); } int kvm_inject_nested_irq(struct kvm_vcpu *vcpu) { /* * Do not inject an irq if the: * - Current exception level is EL2, and * - virtual HCR_EL2.TGE == 0 * - virtual HCR_EL2.IMO == 0 * * See Table D1-17 "Physical interrupt target and masking when EL3 is * not implemented and EL2 is implemented" in ARM DDI 0487C.a. */ if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) && !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO)) return 1; /* esr_el2 value doesn't matter for exits due to irqs. */ return kvm_inject_nested(vcpu, 0, except_type_irq); }
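The trap-routing tables above store a whole union trap_config directly as an xarray value keyed by the sysreg encoding, which is why populate_nv_trap_config() rejects any entry with bit 63 set: xa_mk_value() can only encode values whose top bit is clear. Below is a minimal, hypothetical sketch of that value-in-xarray pattern; the union name and field widths are illustrative and are not taken from the code above.

/*
 * Sketch only: a small bitfield union stored directly as an xarray value.
 * Bit 63 must stay clear so xa_mk_value() can encode the value.
 */
#include <linux/bits.h>
#include <linux/types.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(example_xa);

union example_cfg {
    u64 val;
    struct {
        u64 cgt:10;     /* illustrative field widths, not the real layout */
        u64 fgt:4;
        u64 bit:6;
        u64 pol:1;
        u64 res:43;     /* keeps bit 63 zero */
    };
};

static int example_store(u32 encoding, union example_cfg cfg)
{
    void *old;

    if (cfg.val & BIT(63))          /* xa_mk_value() cannot encode this */
        return -EINVAL;

    old = xa_store(&example_xa, encoding, xa_mk_value(cfg.val), GFP_KERNEL);
    return xa_err(old);
}

static union example_cfg example_load(u32 encoding)
{
    /* A missing entry reads back as an all-zero config, as in the code above. */
    return (union example_cfg) {
        .val = xa_to_value(xa_load(&example_xa, encoding)),
    };
}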
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * fsnotify inode mark locking/lifetime/and refcounting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * mark->connector->lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list. It also protects the group's private
 * data (i.e. group limits).
 * mark->lock protects the mark's attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * mark->connector->lock protects the list of marks anchored inside an
 * inode / vfsmount and each mark is hooked via the i_list.
 *
 * A list of notification marks relating to inode / mnt is contained in
 * fsnotify_mark_connector. That structure is alive as long as there are any
 * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
 * detached from fsnotify_mark_connector when the last reference to the mark is
 * dropped. Thus having a mark reference is enough to protect the
 * mark->connector pointer and to make sure fsnotify_mark_connector cannot
 * disappear. Also because we remove the mark from g_list before dropping the
 * mark reference associated with that, any mark found through g_list is
 * guaranteed to have mark->connector set until we drop group->mark_mutex.
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
*/ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static fsnotify_connp_t *fsnotify_object_connp(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return &((struct inode *)obj)->i_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); default: return NULL; } } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return READ_ONCE(*fsnotify_conn_mask_p(conn)); } static void fsnotify_get_sb_watched_objects(struct super_block *sb) { atomic_long_inc(fsnotify_sb_watched_objects(sb)); } static void fsnotify_put_sb_watched_objects(struct super_block *sb) { atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); /* the superblock can go away after this decrement */ if (atomic_long_dec_and_test(watched_objects)) wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); fsnotify_get_sb_watched_objects(inode->i_sb); } static void fsnotify_put_inode_ref(struct inode *inode) { /* read ->i_sb before the inode can go away */ struct super_block *sb = inode->i_sb; iput(inode); fsnotify_put_sb_watched_objects(sb); } /* * Grab or drop watched objects reference depending on whether the connector * is attached and has any marks attached. */ static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; struct fsnotify_mark *first_mark = NULL; unsigned int highest_prio = 0; if (conn->obj) first_mark = hlist_entry_safe(conn->list.first, struct fsnotify_mark, obj_list); if (first_mark) highest_prio = first_mark->group->priority; if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) highest_prio = 0; /* * If the highest priority of group watching this object is prio, * then watched object has a reference on counters [0..prio]. * Update priority >= 1 watched objects counters. 
*/ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) atomic_long_inc(&sbinfo->watched_objects[p]); for (unsigned int p = conn->prio; p > highest_prio; p--) atomic_long_dec(&sbinfo->watched_objects[p]); conn->prio = highest_prio; /* Update priority >= 0 (a.k.a total) watched objects counter */ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } /* * We use WRITE_ONCE() to prevent silly compiler optimizations from * confusing readers not holding conn->lock with partial updates. */ WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask); return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). 
*/ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; fsnotify_update_sb_watchers(sb, conn); return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { struct super_block *sb = fsnotify_connector_sb(conn); /* Update watched objects after detaching mark */ if (sb) fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. 
*/ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. 
The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. */ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_info_to_sb(struct super_block *sb) { struct fsnotify_sb_info *sbinfo; /* sb info is freed on fsnotify_sb_delete() */ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); if (!sbinfo) return -ENOMEM; /* * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure */ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { /* Someone else created sbinfo for us */ kfree(sbinfo); } return 0; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. 
Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ if (!fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; } connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! 
* group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; if (!connp) return NULL; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. 
However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
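fsnotify_put_mark() and fsnotify_mark_destroy_workfn() above never free a mark or connector directly; they queue it under a spinlock and let a workqueue free it only after synchronize_srcu(), so lockless traversals under fsnotify_mark_srcu never touch freed memory. The following is a minimal, hypothetical sketch of that deferred-free pattern with invented names; it is not the fsnotify implementation itself.

/*
 * Sketch only: objects queued under a spinlock are freed from a workqueue
 * after an SRCU grace period, so readers under srcu_read_lock() are safe.
 */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>

struct example_obj {
    struct list_head list;
};

DEFINE_STATIC_SRCU(example_srcu);
static DEFINE_SPINLOCK(example_destroy_lock);
static LIST_HEAD(example_destroy_list);

static void example_reaper(struct work_struct *work)
{
    struct example_obj *obj, *next;
    LIST_HEAD(private);

    /* Take the whole pending list in one go... */
    spin_lock(&example_destroy_lock);
    list_splice_init(&example_destroy_list, &private);
    spin_unlock(&example_destroy_lock);

    /* ...wait for all SRCU readers to finish... */
    synchronize_srcu(&example_srcu);

    /* ...and only then free the objects. */
    list_for_each_entry_safe(obj, next, &private, list) {
        list_del_init(&obj->list);
        kfree(obj);
    }
}
static DECLARE_WORK(example_reaper_work, example_reaper);

static void example_queue_free(struct example_obj *obj)
{
    spin_lock(&example_destroy_lock);
    list_add(&obj->list, &example_destroy_list);
    spin_unlock(&example_destroy_lock);
    queue_work(system_unbound_wq, &example_reaper_work);
}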
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * "TEE" target extension for Xtables
 * Copyright © Sebastian Claßen, 2007
 * Jan Engelhardt, 2007-2010
 *
 * based on ipt_ROUTE.c from Cédric de Launois
 * <delaunois@info.ucl.be>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/route.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
#include <linux/netfilter/xt_TEE.h>

struct xt_tee_priv {
    struct list_head list;
    struct xt_tee_tginfo *tginfo;
    int oif;
};

static unsigned int tee_net_id __read_mostly;
static const union nf_inet_addr tee_zero_address;

struct tee_net {
    struct list_head priv_list;
    /* lock protects the priv_list */
    struct mutex lock;
};

static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
    const struct xt_tee_tginfo *info = par->targinfo;
    int oif = info->priv ? info->priv->oif : 0;

    nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);

    return XT_CONTINUE;
}

#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
    const struct xt_tee_tginfo *info = par->targinfo;
    int oif = info->priv ?
info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, 
ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE");
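tee_tg_check() and tee_net_init() above rely on per-network-namespace state: registering pernet_operations with .id and .size makes the core allocate one private struct per netns, later retrieved with net_generic(). A minimal, hypothetical sketch of that pattern follows; the example_* names are invented for illustration and are not part of the module above.

/* Sketch only: per-netns private data allocated by the pernet core. */
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct example_net {
    struct list_head items;
    struct mutex lock;
};

static unsigned int example_net_id __read_mostly;

static int __net_init example_net_init(struct net *net)
{
    /* The core allocated .size bytes for this netns; just initialize it. */
    struct example_net *en = net_generic(net, example_net_id);

    INIT_LIST_HEAD(&en->items);
    mutex_init(&en->lock);
    return 0;
}

static struct pernet_operations example_net_ops = {
    .init = example_net_init,
    .id   = &example_net_id,
    .size = sizeof(struct example_net),
};

static int __init example_init(void)
{
    /* After this, .init runs for every existing and future netns. */
    return register_pernet_subsys(&example_net_ops);
}

static void __exit example_exit(void)
{
    unregister_pernet_subsys(&example_net_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");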
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux IPv6 multicast routing support for BSD pim6sd
 *	Based on net/ipv4/ipmr.c.
 *
 *	(c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *		LSIIT Laboratory, Strasbourg, France
 *	(c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *		6WIND, Paris, France
 *	Copyright (C)2007,2008 USAGI/WIDE Project
 *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
#include <net/ip_tunnels.h>
#include <linux/nospec.h>

struct ip6mr_rule {
	struct fib_rule		common;
};

struct ip6mr_result {
	struct mr_table	*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_SPINLOCK(mrt_lock);

static struct net_device *vif_dev_read(const struct vif_device *vif)
{
	return rcu_dereference(vif->dev);
}

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock.
Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static bool ip6mr_can_free_table(struct net *net) { return !check_net(net) || !net_initialized(net); } static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, id); rcu_read_unlock(); return mrt; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops 
__net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static bool ip6mr_can_free_table(struct net *net) { return !check_net(net); } static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } #define __ip6mr_get_table ip6mr_get_table static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } 
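/*
 * Illustrative sketch (not part of ip6mr.c): how a userspace daemon
 * might bind its mroute socket to one of the per-net mr_tables created
 * above when CONFIG_IPV6_MROUTE_MULTIPLE_TABLES is enabled.  Per the
 * MRT6_TABLE handling in ip6_mroute_setsockopt() further below, the
 * table must be selected before MRT6_INIT (the kernel returns -EBUSY
 * once the socket is already the mroute socket) and the id must keep
 * "pim6reg%u" within IFNAMSIZ.  The helper name and the error handling
 * style are assumptions; only the socket options come from this file.
 */
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute6.h>

static int open_mroute6_socket(unsigned int table_id)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

	if (fd < 0)
		return -1;
	/* Pick the mr_table; must precede MRT6_INIT (-EBUSY otherwise). */
	if (setsockopt(fd, IPPROTO_IPV6, MRT6_TABLE,
		       &table_id, sizeof(table_id)) < 0 ||
	    /* Become the mroute socket; ip6mr_sk_init() runs in the kernel. */
	    setsockopt(fd, IPPROTO_IPV6, MRT6_INIT, &one, sizeof(one)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}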
static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); WARN_ON_ONCE(!ip6mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return ERR_PTR(-ENOENT); } iter->mrt = mrt; return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? 
vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", atomic_long_read(&mfc->_c.mfc_un.res.pkt), atomic_long_read(&mfc->_c.mfc_un.res.bytes), atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct 
mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_local = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* 
Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? 
VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. 
* * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
*/ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(const struct net *net) { return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* 
Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, }; int __init ip6_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_many(ip6mr_rtnl_msg_handlers); if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void __init ip6_mr_cleanup(void) { rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && 
!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = 
net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. 
*/ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = 
ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); goto out_free; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) goto out_free; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, vif_dev, ip6mr_forward2_finish); out_free: kfree_skb(skb); return 0; } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int err; struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return -ENOENT; } cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? 
*/ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg 

static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
	struct net *net = read_pnet(&mrt->net);
	struct nlmsghdr *nlh;
	struct rtgenmsg *rtgenm;
	struct mrt6msg *msg;
	struct sk_buff *skb;
	struct nlattr *nla;
	int payloadlen;

	payloadlen = pkt->len - sizeof(struct mrt6msg);
	msg = (struct mrt6msg *)skb_transport_header(pkt);

	skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC);
	if (!skb)
		goto errout;

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
			sizeof(struct rtgenmsg), 0);
	if (!nlh)
		goto errout;
	rtgenm = nlmsg_data(nlh);
	rtgenm->rtgen_family = RTNL_FAMILY_IP6MR;
	if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) ||
	    nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) ||
	    nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) ||
	    nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst))
		goto nla_put_failure;

	nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen);
	if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg),
				  nla_data(nla), payloadlen))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
errout:
	kfree_skb(skb);
	rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS);
}

static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = {
	[RTA_SRC]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[RTA_DST]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[RTA_TABLE]	= { .type = NLA_U32 },
};

static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy,
			  extack);
	if (err)
		return err;

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid values in header for multicast route get request");
		return -EINVAL;
	}

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	return 0;
}

static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct in6_addr src = {}, grp = {};
	struct nlattr *tb[RTA_MAX + 1];
	struct mfc6_cache *cache;
	struct mr_table *mrt;
	struct sk_buff *skb;
	u32 tableid;
	int err;

	err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	if (tb[RTA_SRC])
		src = nla_get_in6_addr(tb[RTA_SRC]);
	if (tb[RTA_DST])
		grp = nla_get_in6_addr(tb[RTA_DST]);
	tableid = nla_get_u32_default(tb[RTA_TABLE], 0);

	mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
	if (!mrt) {
		NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
		return -ENOENT;
	}

	/* entries are added/deleted only under RTNL */
	rcu_read_lock();
	cache = ip6mr_cache_find(mrt, &src, &grp);
	rcu_read_unlock();
	if (!cache) {
		NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
		return -ENOENT;
	}

	skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
				nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
}
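
/*
 * ip6mr_rtm_getroute() above serves RTM_GETROUTE requests whose rtmsg family
 * is expected to be RTNL_FAMILY_IP6MR.  As enforced by
 * ip6mr_rtm_valid_getroute_req(), a request carries rtm_src_len/rtm_dst_len
 * of 0 or 128 with all other header fields zero, plus RTA_SRC/RTA_DST
 * (16-byte IPv6 addresses) and an optional RTA_TABLE; table 0 falls back to
 * RT_TABLE_DEFAULT.  An illustrative user-space request layout (a sketch,
 * not part of this file) might be:
 *
 *	struct {
 *		struct nlmsghdr nlh;	// nlmsg_type = RTM_GETROUTE
 *		struct rtmsg	rtm;	// rtm_family = RTNL_FAMILY_IP6MR,
 *					// rtm_src_len = rtm_dst_len = 128
 *		// nlattrs: RTA_SRC, RTA_DST (struct in6_addr each)
 *	} req;
 *
 * The reply is a single RTM_NEWROUTE message built by ip6mr_fill_mroute().
 */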

static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
	const struct nlmsghdr *nlh = cb->nlh;
	struct fib_dump_filter filter = {
		.rtnl_held = true,
	};
	int err;

	if (cb->strict_check) {
		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
					    &filter, cb);
		if (err < 0)
			return err;
	}

	if (filter.table_id) {
		struct mr_table *mrt;

		mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id);
		if (!mrt) {
			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
				return skb->len;

			NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
			return -ENOENT;
		}
		err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
				    &mfc_unres_lock, &filter);
		return skb->len ? : err;
	}

	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
}
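
/*
 * ip6mr_rtm_dumproute() above is the RTM_GETROUTE dump handler.  With strict
 * checking the request header is validated by ip_valid_fib_dump_req(); if the
 * caller filtered on a table id, only that mr_table is dumped via
 * mr_table_dump(), otherwise mr_rtm_dumproute() walks every table using
 * ip6mr_mr_table_iter().  Either way each MFC entry is rendered with
 * _ip6mr_fill_mroute(), so dumped entries use the same layout as the unicast
 * replies and the RTNLGRP_IPV6_MROUTE notifications.
 */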
11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 
12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 
13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 
13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly. This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}
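/*
 * Illustrative usage sketch (editorial, not part of the file above; do_work,
 * task, cpu and info are hypothetical names): callers package their work as a
 * remote_function_f and dispatch it either at a task or at a CPU, much as
 * event_function_call() further below does.
 *
 *	static int do_work(void *info)
 *	{
 *		// runs on the target CPU, in IPI context with IRQs disabled
 *		return 0;
 *	}
 *
 *	if (task)
 *		ret = task_function_call(task, do_work, info);	// -ESRCH when @task isn't running
 *	else
 *		ret = cpu_function_call(cpu, do_work, info);	// -ENXIO when @cpu is offline
 */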
enum event_type_t {
	EVENT_FLEXIBLE	= 0x01,
	EVENT_PINNED	= 0x02,
	EVENT_TIME	= 0x04,
	EVENT_FROZEN	= 0x08, /* see ctx_resched() for details */
	EVENT_CPU	= 0x10,
	EVENT_CGROUP	= 0x20,

	/* compound helpers */
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
};

static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
	raw_spin_lock(&ctx->lock);
	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	__perf_ctx_lock(&cpuctx->ctx);
	if (ctx)
		__perf_ctx_lock(ctx);
}

static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
	/*
	 * If ctx_sched_in() didn't again set any ALL flags, clean up
	 * after ctx_sched_out() by clearing is_active.
	 */
	if (ctx->is_active & EVENT_FROZEN) {
		if (!(ctx->is_active & EVENT_ALL))
			ctx->is_active = 0;
		else
			ctx->is_active &= ~EVENT_FROZEN;
	}
	raw_spin_unlock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		__perf_ctx_unlock(ctx);
	__perf_ctx_unlock(&cpuctx->ctx);
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
	lockdep_assert_irqs_disabled();

	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
*/ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; local_irq_disable(); cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (ctx->is_active) { perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); goto again; } func(event, NULL, ctx, data); unlock: perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. 
*/ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) /* * perf_sched_events : >0 events exist */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; static cpumask_var_t perf_online_core_mask; static cpumask_var_t perf_online_die_mask; static cpumask_var_t perf_online_cluster_mask; static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); 
return 0; } int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). 
* * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ if (_cgroup && !_epc->nr_cgroups) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. 
*/ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. 
*/ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; CLASS(fd, f)(fd); int ret = 0; if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. 
*/ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void *alloc_task_ctx_data(struct pmu *pmu) { if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. 
* * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. 
*/ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. 
*/ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. 
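 * (cgroup_id() is a u64 that is unique per cgroup, so this gives a
 * stable total order within the tree.)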
*/ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. 
*/ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Does the event attribute request inherit with PERF_SAMPLE_READ */ static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) { return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); } /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr)) local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. 
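 *
 * It recomputes event->read_size, which depends on the read_format and
 * the current number of siblings, and event->header_size, which depends
 * on attr.sample_type, using the two helpers above.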
*/ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. 
 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	ctx->nr_events--;
	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
		ctx->nr_user--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;
	if (has_inherit_and_sample_read(&event->attr))
		local_dec(&ctx->nr_no_switch_fast);

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		del_event_from_groups(event, ctx);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF) {
		perf_cgroup_event_disable(event, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
	}

	ctx->generation++;
	event->pmu_ctx->nr_events--;
}

static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
	if (!has_aux(aux_event))
		return 0;

	if (!event->pmu->aux_output_match)
		return 0;

	return event->pmu->aux_output_match(aux_event);
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
			    struct perf_event_context *ctx);

static void perf_put_aux_event(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *iter;

	/*
	 * If event uses aux_event tear down the link
	 */
	if (event->aux_event) {
		iter = event->aux_event;
		event->aux_event = NULL;
		put_event(iter);
		return;
	}

	/*
	 * If the event is an aux_event, tear down all links to
	 * it from other events.
	 */
	for_each_sibling_event(iter, event->group_leader) {
		if (iter->aux_event != event)
			continue;

		iter->aux_event = NULL;
		put_event(event);

		/*
		 * If it's ACTIVE, schedule it out and put it into ERROR
		 * state so that we don't try to schedule it again. Note
		 * that perf_event_enable() will clear the ERROR status.
		 */
		event_sched_out(iter, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
	}
}

static bool perf_need_aux_event(struct perf_event *event)
{
	return event->attr.aux_output || has_aux_action(event);
}

static int perf_get_aux_event(struct perf_event *event,
			      struct perf_event *group_leader)
{
	/*
	 * Our group leader must be an aux event if we want to be
	 * an aux_output. This way, the aux event will precede its
	 * aux_output events in the group, and therefore will always
	 * schedule first.
	 */
	if (!group_leader)
		return 0;

	/*
	 * aux_output and aux_sample_size are mutually exclusive.
	 */
	if (event->attr.aux_output && event->attr.aux_sample_size)
		return 0;

	if (event->attr.aux_output &&
	    !perf_aux_output_match(event, group_leader))
		return 0;

	if ((event->attr.aux_pause || event->attr.aux_resume) &&
	    !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
		return 0;

	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
		return 0;

	if (!atomic_long_inc_not_zero(&group_leader->refcount))
		return 0;

	/*
	 * Link aux_outputs to their aux event; this is undone in
	 * perf_group_detach() by perf_put_aux_event(). When the
	 * group is torn down, the aux_output events lose their
	 * link to the aux_event and can't schedule any more.
	 */
	event->aux_event = group_leader;

	return 1;
}

static inline struct list_head *get_event_list(struct perf_event *event)
{
	return event->attr.pinned ? &event->pmu_ctx->pinned_active :
				    &event->pmu_ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
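 *
 * The helper below therefore simply schedules the sibling out and forces
 * it into PERF_EVENT_STATE_ERROR.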
*/ static inline void perf_remove_sibling_event(struct perf_event *event) { event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; lockdep_assert_held(&parent_event->child_mutex); sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). 
*/ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq--; epc->nr_freq--; } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } static inline void __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __ctx_time_update(cpuctx, ctx, false); } /* * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). */ static inline void ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_time_update(cpuctx, ctx); if (ctx->is_active & EVENT_TIME) ctx->is_active |= EVENT_FROZEN; } static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_event(event); } } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_DEAD) event->pending_disable = 1; event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (flags & DETACH_DEAD) event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. 
This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); else event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. 
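 *
 * So an in-kernel user, e.g. one that set up a counter with
 * perf_event_create_kernel_counter(), can simply pair:
 *
 *	perf_event_disable(event);
 *	...
 *	perf_event_enable(event);
 *
 * without dealing with the ctx locking above.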
*/ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq++; epc->nr_freq++; } if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. 
*/ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct pmu *pmu) { ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, pmu, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_disable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, pmu); for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_enable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_enable(epc->pmu); } } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. 
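 *
 * It therefore works the situation out itself: the event is always added
 * to the context, but the PMU is only reprogrammed (via ctx_resched())
 * when the target context is current on this CPU and, for cgroup events,
 * the event's cgroup is active for the running task; otherwise ctx->lock
 * serializes against the context being scheduled in later.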
*/ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. 
If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) return; /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. 
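 * A PERF_EV_CAP_SIBLING event that has become its own group leader was
 * split off by perf_group_detach() and put into ERROR by
 * perf_remove_sibling_event(); it must stay there.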
*/ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. 
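 *
 * The (p2) half is perf_event_addr_filters_sync() below: under
 * perf_addr_filters_head::lock it compares event->addr_filters_gen with
 * event->hw.addr_filters_gen and, only if they differ, has the PMU
 * re-program its filters and records the new generation.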
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

	if (!has_addr_filter(event))
		return;

	raw_spin_lock(&ifh->lock);
	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
		event->pmu->addr_filters_sync(event);
		event->hw.addr_filters_gen = event->addr_filters_gen;
	}
	raw_spin_unlock(&ifh->lock);
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

static int _perf_event_refresh(struct perf_event *event, int refresh)
{
	/*
	 * not supported on inherited events
	 */
	if (event->attr.inherit || !is_sampling_event(event))
		return -EINVAL;

	atomic_add(refresh, &event->event_limit);
	_perf_event_enable(event);

	return 0;
}

/*
 * See perf_event_disable()
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
	struct perf_event_context *ctx;
	int ret;

	ctx = perf_event_ctx_lock(event);
	ret = _perf_event_refresh(event, refresh);
	perf_event_ctx_unlock(event, ctx);

	return ret;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

static int perf_event_modify_breakpoint(struct perf_event *bp,
					struct perf_event_attr *attr)
{
	int err;

	_perf_event_disable(bp);

	err = modify_user_hw_breakpoint_check(bp, attr, true);

	if (!bp->attr.disabled)
		_perf_event_enable(bp);

	return err;
}

/*
 * Copy event-type-independent attributes that may be modified.
 */
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
					const struct perf_event_attr *from)
{
	to->sig_data = from->sig_data;
}

static int perf_event_modify_attr(struct perf_event *event,
				  struct perf_event_attr *attr)
{
	int (*func)(struct perf_event *, struct perf_event_attr *);
	struct perf_event *child;
	int err;

	if (event->attr.type != attr->type)
		return -EINVAL;

	switch (event->attr.type) {
	case PERF_TYPE_BREAKPOINT:
		func = perf_event_modify_breakpoint;
		break;
	default:
		/* Placeholder for future additions. */
		return -EOPNOTSUPP;
	}

	WARN_ON_ONCE(event->ctx->parent_ctx);

	mutex_lock(&event->child_mutex);
	/*
	 * Event-type-independent attributes must be copied before event-type
	 * modification, which will validate that final attributes match the
	 * source attributes after all relevant attributes have been copied.
	 */
	perf_event_modify_copy_attr(&event->attr, attr);
	err = func(event, attr);
	if (err)
		goto out;
	list_for_each_entry(child, &event->child_list, child_list) {
		perf_event_modify_copy_attr(&child->attr, attr);
		err = func(child, attr);
		if (err)
			goto out;
	}
out:
	mutex_unlock(&event->child_mutex);
	return err;
}

static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
				enum event_type_t event_type)
{
	struct perf_event_context *ctx = pmu_ctx->ctx;
	struct perf_event *event, *tmp;
	struct pmu *pmu = pmu_ctx->pmu;

	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
		struct perf_cpu_pmu_context *cpc;

		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
		WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
		cpc->task_epc = NULL;
	}

	if (!(event_type & EVENT_ALL))
		return;

	perf_pmu_disable(pmu);
	if (event_type & EVENT_PINNED) {
		list_for_each_entry_safe(event, tmp,
					 &pmu_ctx->pinned_active,
					 active_list)
			group_sched_out(event, ctx);
	}

	if (event_type & EVENT_FLEXIBLE) {
		list_for_each_entry_safe(event, tmp,
					 &pmu_ctx->flexible_active,
					 active_list)
			group_sched_out(event, ctx);
		/*
		 * Since we cleared EVENT_FLEXIBLE, also clear
		 * rotate_necessary, it will be reset by
		 * ctx_flexible_sched_in() when needed.
		 */
		pmu_ctx->rotate_necessary = 0;
	}
	perf_pmu_enable(pmu);
}

/*
 * Be very careful with the @pmu argument since this will change ctx state.
* The @pmu argument works for ctx_resched(), because that is symmetric in * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. * * However, if you were to be asymmetrical, you could end up with messed up * state, eg. ctx->is_active cleared even though most EPCs would still actually * be active. */ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) { /* * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() * does not observe a hole. perf_ctx_unlock() will clean up. */ if (ctx->is_active & EVENT_FROZEN) ctx->is_active &= EVENT_TIME_FROZEN; else ctx->is_active = 0; } if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. 
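 *
 * I.e. because perf_event_context_sched_out() swaps the (cloned) contexts
 * between the two tasks, the counts and times must swap with them so that
 * each task keeps accumulating into "its" event, roughly:
 *
 *	swap(event->count, next_event->count);
 *	swap(event->total_time_enabled, next_event->total_time_enabled);
 *	swap(event->total_time_running, next_event->total_time_running);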
*/ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } #define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ pos2 = list_first_entry(head2, typeof(*pos2), member); \ !list_entry_is_head(pos1, head1, member) && \ !list_entry_is_head(pos2, head2, member); \ pos1 = list_next_entry(pos1, member), \ pos2 = list_next_entry(pos2, member)) static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, struct perf_event_context *next_ctx) { struct perf_event_pmu_context *prev_epc, *next_epc; if (!prev_ctx->nr_task_data) return; double_list_for_each_entry(prev_epc, next_epc, &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, pmu_ctx_entry) { if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) continue; /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such * synchronization see implementation details of Intel * LBR call stack data profiling; */ if (prev_epc->pmu->swap_task_ctx) prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); else swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. 
*/ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. * * Likewise, when a context contains inherit + * SAMPLE_READ events they should be switched * out using the slow path so that they are * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task and ctx->task_ctx_data are immaterial * since those values are always verified under * ctx->lock which we're now holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
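 *
 * The helper below only provides the environment the callback expects:
 *
 *	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 *	perf_pmu_disable(pmu);
 *	pmu->sched_task(cpc->task_epc, sched_in);
 *	perf_pmu_enable(pmu);
 *	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);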
*/ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { .less = perf_less_group_idx, .swp = NULL, }; static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. 
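 * A task context needs at most two on-stack iterators: one for events
 * bound to @cpu and one for any-CPU (cpu == -1) events. A CPU context
 * instead uses the pre-allocated cpuctx->heap, which also has room for
 * the cgroup ancestor iterators.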
*/ struct perf_event *itrs[2]; struct perf_event_min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct perf_event_min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct perf_event_min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { struct perf_cpu_pmu_context *cpc; event->pmu_ctx->rotate_necessary = 1; cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, struct pmu *pmu) { int can_add_hw = 1; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); if (event_type & EVENT_FLEXIBLE) pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int 
is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); } } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. 
*/ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); if (delta >= 0) delta += 7; else delta -= 7; delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); if (!event->attr.freq || !event->attr.sample_freq) event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure that having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event_pmu_context *pmu_ctx; /* * only need to iterate over all events iff: * - the context has events in frequency mode (needs freq adjust) * - there are events to unthrottle on this CPU */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (!(pmu_ctx->nr_freq || unthrottle)) continue; if (!perf_pmu_ctx_is_active(pmu_ctx)) continue; if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) continue; perf_pmu_disable(pmu_ctx->pmu); perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); perf_pmu_enable(pmu_ctx->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's eligible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry of the non-pinned groups to the end. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched(), first 'pop' task flexible * and then, if needed, CPU flexible.
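 *
 * Worked example (hypothetical): two flexible events A and B competing for
 * a single counter. One tick schedules [A, B] so only A counts and
 * rotate_necessary gets set; this rotation moves A to the tail, so the next
 * tick schedules [B, A] and B gets its turn. Over time both accumulate
 * comparable running time, which the enabled/running scaling in the read
 * path then compensates for.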
*/ if (task_event || (task_epc && cpu_event)) { update_context_time(task_epc->ctx); __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); } if (cpu_event) { update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_adjust_freq_unthr_context(ctx, !!throttled); rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; if (!ctx->nr_events) goto out; cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. 
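 *
 * Userspace opts in per event at open time, e.g. (illustrative):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.remove_on_exec	= 1,
 *	};
 *
 * Such events are torn down here rather than surviving the exec().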
*/ static void perf_event_remove_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) goto unlock; list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue; if (!is_kernel_event(event)) perf_remove_from_owner(event); modified = true; perf_event_exit_event(event, ctx); } raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) return local_cpu; } if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; raw_spin_lock(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; if (!data->group) { pmu->read(event); data->ret = 0; goto unlock; } pmu->start_txn(pmu, PERF_PMU_TXN_READ); pmu->read(event); for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since * sibling could be on different (eg: software) PMU. 
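 *
 * E.g. in a group with a hardware leader (cycles) and a software sibling
 * (context-switches), the sibling must be read through the software PMU's
 * ->read(), not through the leader's PMU.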
*/ sub->pmu->read(sub); } } data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event, bool self) { if (self) return local64_read(&event->count); return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int event_oncpu; int event_cpu; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* * Get the event CPU numbers, and adjust them to local if the event is * a per-package event that can be read locally */ event_oncpu = __perf_event_read_cpu(event, event->oncpu); event_cpu = __perf_event_read_cpu(event, event->cpu); /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). */ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. 
*/ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); refcount_set(&ctx->refcount, 1); } static void __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) { epc->pmu = pmu; INIT_LIST_HEAD(&epc->pmu_ctx_entry); INIT_LIST_HEAD(&epc->pinned_active); INIT_LIST_HEAD(&epc->flexible_active); atomic_set(&epc->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int err; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(&event->attr); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. 
*/ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } return ctx; errout: return ERR_PTR(err); } static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { /* * perf_pmu_migrate_context() / __perf_pmu_install_event() * relies on the fact that find_get_pmu_context() cannot fail * for CPU contexts. */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } raw_spin_unlock_irq(&ctx->lock); return epc; } new = kzalloc(sizeof(*epc), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); if (event->attach_state & PERF_ATTACH_TASK_DATA) { task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { kfree(new); return ERR_PTR(-ENOMEM); } } __perf_init_event_pmu_context(new, pmu); /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex. */ raw_spin_lock_irq(&ctx->lock); list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); goto found_epc; } } epc = new; new = NULL; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; found_epc: if (task_ctx_data && !epc->task_ctx_data) { epc->task_ctx_data = task_ctx_data; task_ctx_data = NULL; ctx->nr_task_data++; } raw_spin_unlock_irq(&ctx->lock); free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; } static void get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); kfree(epc->task_ctx_data); kfree(epc); } static void put_pmu_ctx(struct perf_event_pmu_context *epc) { struct perf_event_context *ctx = epc->ctx; unsigned long flags; /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex. 
*/ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); raw_spin_unlock_irqrestore(&ctx->lock, flags); if (epc->embedded) return; call_rcu(&epc->rcu_head, free_epc_rcu); } static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.build_id) atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). 
Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. */ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); static void perf_pending_task_sync(struct perf_event *event) { struct callback_head *head = &event->pending_task; if (!event->pending_work) return; /* * If the task is queued to the current task's queue, we * obviously can't wait for it to complete. Simply cancel it. */ if (task_work_cancel(current, head)) { event->pending_work = 0; local_dec(&event->ctx->nr_no_switch_fast); return; } /* * All accesses related to the event are within the same RCU section in * perf_pending_task(). The RCU grace period before the event is freed * will make sure all those accesses are complete by then. */ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); perf_pending_task_sync(event); unaccount_event(event); security_perf_event_free(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. 
*/ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); if (event->pmu_ctx) put_pmu_ctx(event->pmu_ctx); /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); exclusive_event_destroy(event); module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { if (!atomic_long_dec_and_test(&event->refcount)) return; _free_event(event); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. 
* * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { void *var = NULL; /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. */ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. */ put_event(event); } else { var = &ctx->refcount; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); if (var) { /* * If perf_event_free_task() has deleted all events from the * ctx while the child_mutex got released above, make sure to * notify about the preceding put_ctx(). */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } goto again; } mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { void *var = &child->ctx->refcount; list_del(&child->child_list); free_event(child); /* * Wake any perf_event_free_task() waiting for this event to be * freed. */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. 
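 *
 * I.e. the final close(2) on a perf event fd ends up here (illustrative
 * userspace sequence):
 *
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	...
 *	close(fd);	triggers perf_release() -> perf_event_release_kernel()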
*/ static int perf_release(struct inode *inode, struct file *file) { perf_event_release_kernel(file->private_data); return 0; } static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; *enabled = 0; *running = 0; mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } mutex_unlock(&event->child_mutex); return total; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); count = __perf_event_read_value(event, enabled, running); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; ret = perf_event_read(leader, true); if (ret) return ret; raw_spin_lock_irqsave(&ctx->lock, flags); /* * Verify the grouping between the parent and child (inherited) * events is still intact. * * Specifically: * - leader->ctx->lock pins leader->sibling_list * - parent->child_mutex pins parent->child_list * - parent->ctx->mutex pins parent->sibling_list * * Because parent->ctx != leader->ctx (and child_list nests inside * ctx->mutex), group destruction is not atomic between children, also * see perf_event_release_kernel(). Additionally, the parent can grow the * group. * * Therefore it is possible to have parent and child groups in a * different configuration and summing over such a beast makes no sense * whatsoever. * * Reject this. */ parent = leader->parent; if (parent && (parent->group_generation != leader->group_generation || parent->nr_siblings != leader->nr_siblings)) { ret = -ECHILD; goto unlock; } /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set. */ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] += leader->total_time_enabled + atomic64_read(&leader->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] += leader->total_time_running + atomic64_read(&leader->child_total_time_running); } /* * Write {count,id} tuples for the leader and every sibling.
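 *
 * The resulting read() layout for PERF_FORMAT_GROUP, assuming
 * TOTAL_TIME_ENABLED, TOTAL_TIME_RUNNING and ID are also requested
 * (PERF_FORMAT_LOST would add one more u64 per entry), is:
 *
 *	{ u64 nr;
 *	  u64 time_enabled;
 *	  u64 time_running;
 *	  { u64 value;
 *	    u64 id; } cnt[nr];
 *	};
 *
 * with the leader as the first entry.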
*/ values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); return ret; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). 
*/ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); if (is_event_hup(event)) return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. 
*/ if (event->hw.interrupts == MAX_INTERRUPTS) { event->hw.interrupts = 0; perf_log_throttle(event, 1); } event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(event->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; if (perf_event_check_period(event, value)) return -EINVAL; if (!event->attr.freq && (value & (1ULL << 63))) return -EINVAL; event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline bool is_perf_file(struct fd f) { return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { CLASS(fd, output)(arg); // arg == -1 => empty struct perf_event *output_event = NULL; if (arg != -1) { if (!is_perf_file(output)) return -EBADF; output_event = fd_file(output)->private_data; } return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: { struct bpf_prog *prog; int err; prog = bpf_prog_get(arg); if (IS_ERR(prog)) return PTR_ERR(prog); err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; } return 0; } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file 
*file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we can be called in * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. 
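 *
 * The ++userpg->lock / barrier() pairs below form the write side of a
 * seqcount; userspace reads the mapped struct perf_event_mmap_page (pc)
 * with the matching loop (illustrative):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx     = pc->index;
 *		offset  = pc->offset;
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		barrier();
 *	} while (pc->lock != seq);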
*/ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. */ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. 
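 *
 * Such redirection is what userspace sets up with
 * ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a): event B then writes its
 * records into A's buffer and must also be detached when that buffer goes
 * away.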
*/ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&rb->aux_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. */ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. */ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) { /* The first page is the user control page, others are read-only. */ return vmf->pgoff == 0 ? 
0 : VM_FAULT_SIGBUS; } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .pfn_mkwrite = perf_mmap_pfn_mkwrite, }; static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) { unsigned long nr_pages = vma_pages(vma); int err = 0; unsigned long pagenum; /* * We map this as a VM_PFNMAP VMA. * * This is not ideal as this is designed broadly for mappings of PFNs * referencing memory-mapped I/O ranges or non-system RAM i.e. for which * !pfn_valid(pfn). * * We are mapping kernel-allocated memory (memory we manage ourselves) * which would more ideally be mapped using vm_insert_page() or a * similar mechanism, that is as a VM_MIXEDMAP mapping. * * However this won't work here, because: * * 1. It uses vma->vm_page_prot, but this field has not been completely * setup at the point of the f_op->mmp() hook, so we are unable to * indicate that this should be mapped CoW in order that the * mkwrite() hook can be invoked to make the first page R/W and the * rest R/O as desired. * * 2. Anything other than a VM_PFNMAP of valid PFNs will result in * vm_normal_page() returning a struct page * pointer, which means * vm_ops->page_mkwrite() will be invoked rather than * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping * to work around retry logic in the fault handler, however this * field is no longer allowed to be used within struct page. * * 3. Having a struct page * made available in the fault logic also * means that the page gets put on the rmap and becomes * inappropriately accessible and subject to map and ref counting. * * Ideally we would have a mechanism that could explicitly express our * desires, but this is not currently the case, so we instead use * VM_PFNMAP. * * We manage the lifetime of these mappings with internal refcounts (see * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of * this mapping is maintained correctly. */ for (pagenum = 0; pagenum < nr_pages; pagenum++) { unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); if (page == NULL) { err = -EINVAL; break; } /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); if (err) break; } #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); #endif return err; } static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. 
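 *
 * [Editor's illustration, not part of the original source: user space
 *  typically sets up the AUX area in two steps, per the documented
 *  mmap ABI; the page counts below are examples only:]
 *
 *	// First the control page plus 2^n data pages:
 *	struct perf_event_mmap_page *pc = mmap(NULL, (1 + 16) * psize,
 *			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	// Then advertise where the AUX area will live and map it there:
 *	pc->aux_offset = (1 + 16) * psize;
 *	pc->aux_size   = 64 * psize;
 *	void *aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, pc->aux_offset);
 *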
Must be above the normal perf buffer. */ u64 aux_offset, aux_size; if (!event->rb) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; if (nr_pages > INT_MAX) return -ENOMEM; mutex_lock(&event->mmap_mutex); ret = -EINVAL; rb = event->rb; if (!rb) goto aux_unlock; aux_mutex = &rb->aux_mutex; mutex_lock(aux_mutex); aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) goto aux_unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) goto aux_unlock; if (!is_power_of_2(nr_pages)) goto aux_unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) goto aux_unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; goto unlock; } atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; goto accounting; } /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Raced against perf_mmap_close(); remove the * event and try again. */ ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } /* We need the rb to map pages. */ rb = event->rb; goto unlock; } user_extra = nr_pages + 1; accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ extra = user_locked - user_lock_limit; user_extra -= extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; if (!rb) { rb = rb_alloc(nr_pages, event->attr.watermark ? 
event->attr.wakeup_watermark : 0, event->cpu, flags); if (!rb) { ret = -ENOMEM; goto unlock; } atomic_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { atomic_dec(&rb->mmap_count); } aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (!ret) ret = map_range(rb, vma); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_sigtrap(struct perf_event *event) { /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. This can be the * case on architectures that do not implement arch_irq_work_raise(). */ if (WARN_ON_ONCE(event->ctx->task != current)) return; /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting. */ if (current->flags & PF_EXITING) return; send_sig_perf((void __user *)event->pending_addr, event->orig_type, event->attr.sig_data); } /* * Deliver the pending work in-event-context or follow the context. */ static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); /* * If the event isn't running; we done. event_sched_out() will have * taken care of things. */ if (cpu < 0) return; /* * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); } return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = CPU-A; * irq_work_queue(); * * sched-out * @pending_disable = -1; * * sched-in * perf_event_disable_inatomic() * @pending_disable = CPU-B; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. 
*/ irq_work_queue_on(&event->pending_disable_irq, cpu); } static void perf_pending_disable(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); __perf_pending_disable(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); /* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is. */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_task(struct callback_head *head) { struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; /* * All accesses to the event must belong to the same implicit RCU read-side * critical section as the ->pending_work reset. See comment in * perf_pending_task_sync(). */ rcu_read_lock(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); static_call_update(__perf_guest_state, cbs->state); static_call_update(__perf_guest_get_ip, cbs->get_ip); /* Implementing ->handle_intel_pt_intr is optional. 
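 *
 * [Editor's illustration, not part of the original source: a hypervisor
 *  (KVM is the in-tree user) registers these callbacks roughly as below;
 *  the my_* helpers are hypothetical:]
 *
 *	static unsigned int my_guest_state(void)
 *	{
 *		return in_guest ? PERF_GUEST_ACTIVE : 0;	// plus PERF_GUEST_USER as needed
 *	}
 *
 *	static unsigned long my_guest_get_ip(void)
 *	{
 *		return current_guest_instruction_pointer();	// hypothetical helper
 *	}
 *
 *	static struct perf_guest_info_callbacks my_cbs = {
 *		.state	= my_guest_state,
 *		.get_ip	= my_guest_get_ip,
 *		// .handle_intel_pt_intr left NULL; as noted here, it is optional
 *	};
 *
 *	perf_register_guest_info_callbacks(&my_cbs);
 *	// ... and on teardown:
 *	perf_unregister_guest_info_callbacks(&my_cbs);
 *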
*/ if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return; rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif static bool should_sample_guest(struct perf_event *event) { return !event->attr.exclude_guest && perf_guest_state(); } unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_arch_guest_misc_flags(regs); return perf_arch_misc_flags(regs); } unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_guest_get_ip(); return perf_arch_instruction_pointer(regs); } static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. */ if (!regs) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. 
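 *
 * [Editor's illustration, not part of the original source: the dump sized
 *  here is requested from user space via the documented
 *  PERF_SAMPLE_STACK_USER ABI; the 8 KiB request is only an example:]
 *
 *	struct perf_event_attr attr = {
 *		.type			= PERF_TYPE_HARDWARE,
 *		.config			= PERF_COUNT_HW_CPU_CYCLES,
 *		.sample_period		= 100000,
 *		.sample_type		= PERF_SAMPLE_IP | PERF_SAMPLE_STACK_USER,
 *		.sample_stack_user	= 8192,		// requested static size
 *		.exclude_kernel		= 1,
 *	};
 *
 *	// Each sample then carries:  u64 size; char data[size]; u64 dyn_size;
 *	// where dyn_size <= size is how much was actually dumped.
 *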
*/ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); rem = __output_copy_user(handle, (void *) sp, dump_size); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } static long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. */ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). */ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. 
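 *
 * [Editor's note, not part of the original source: for example, if the
 *  PMU's snapshot_aux() callback wrote 61 bytes while data->aux_size had
 *  been rounded up to 64, the remaining pad of 3 bytes is filled with
 *  zeroes below so the record stays u64-aligned.]
 *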
*/ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } /* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set. */ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) { header->size += event->id_header_size; __perf_event_header__init_id(data, event, event->attr.sample_type); } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). 
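 *
 * [Editor's illustration, not part of the original source: the values[]
 *  arrays built below emit the documented PERF_FORMAT_GROUP layout, which
 *  a user-space reader decodes roughly as:]
 *
 *	struct read_format {
 *		u64	nr;		// number of events in the group
 *		u64	time_enabled;	// if PERF_FORMAT_TOTAL_TIME_ENABLED
 *		u64	time_running;	// if PERF_FORMAT_TOTAL_TIME_RUNNING
 *		struct {
 *			u64	value;
 *			u64	id;	// if PERF_FORMAT_ID
 *			u64	lost;	// if PERF_FORMAT_LOST
 *		} cnt[];		// one entry per group member
 *	};
 *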
*/ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that it's both hard and excessively expensive to iterate the * child list, not to mention that it's impossible to IPI the children running * on another CPU, from interrupt/NMI context. * * Instead, the combination of PERF_SAMPLE_READ and inherit will track per-thread * counts rather than attempting to accumulate some value across all children on * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in.
* * we cannot simply called update_context_time() * because of locking issue as we are called in * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); /* * Add the extension space which is appended * right after the struct perf_branch_stack. */ if (data->br_stack_cntr) { size = data->br_stack->nr * sizeof(u64); perf_output_copy(handle, data->br_stack_cntr, size); } } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
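 *
 * [Editor's illustration, not part of the original source: user space asks
 *  for this register block by setting a mask, e.g. on x86-64 using the bit
 *  numbers from the architecture's uapi perf_regs.h:]
 *
 *	attr.sample_type      |= PERF_SAMPLE_REGS_USER;
 *	attr.sample_regs_user  = (1ULL << PERF_REG_X86_IP) |
 *				 (1ULL << PERF_REG_X86_SP);
 *
 *	// The sample then contains:  u64 abi;  u64 regs[hweight64(mask)];
 *	// with abi == PERF_SAMPLE_REGS_ABI_NONE (0) when nothing was dumped.
 *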
*/ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } /* * Return the pagetable size of a given virtual address. 
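 *
 * [Editor's note, not part of the original source: this walk backs the
 *  PERF_SAMPLE_DATA_PAGE_SIZE / PERF_SAMPLE_CODE_PAGE_SIZE sample bits; a
 *  consumer just sets, for example:]
 *
 *	attr.sample_type |= PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR |
 *			    PERF_SAMPLE_DATA_PAGE_SIZE;
 *
 *	// Each sample then carries one u64 per bit: addr, phys_addr and the
 *	// page size (0 when it could not be determined).
 *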
*/ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; #ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) return 0; if (pgd_leaf(pgd)) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); p4d = READ_ONCE(*p4dp); if (!p4d_present(p4d)) return 0; if (p4d_leaf(p4d)) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return 0; if (pud_leaf(pud)) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (pmd_leaf(pmd)) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); if (!ptep) goto again; pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_GUP_FAST */ return size; } static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; unsigned long flags; u64 size; if (!addr) return 0; /* * Software page-table walkers must disable IRQs, * which prevents any tear down of the page tables. */ local_irq_save(flags); mm = current->mm; if (!mm) { /* * For kernel threads and the like, use init_mm so that * we can find kernel memory. */ mm = &init_mm; } size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); return size; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; if (!kernel && !user) return &__empty_callchain; callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) { return d * !!(flags & s); } void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; /* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver. 
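 *
 * [Editor's note, not part of the original source: __cond_set(flags, s, d)
 *  above evaluates to d when (flags & s) is set and to 0 otherwise, so for
 *  example requesting PERF_SAMPLE_STACK_USER implicitly pulls in
 *  PERF_SAMPLE_REGS_USER below, while anything already recorded in
 *  data->sample_flags by the PMU driver is masked back out.]
 *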
*/ filtered_sample_type = sample_type; filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER); filtered_sample_type &= ~data->sample_flags; if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */ data->type = event->attr.sample_type; return; } __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, regs); if (filtered_sample_type & PERF_SAMPLE_RAW) { data->raw = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_RAW; } if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { data->br_stack = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); /* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users. */ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_USER; } if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. 
*/ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_STACK_USER; } if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; data->sample_flags |= PERF_SAMPLE_DATA_SRC; } if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; data->sample_flags |= PERF_SAMPLE_TRANSACTION; } if (filtered_sample_type & PERF_SAMPLE_ADDR) { data->addr = 0; data->sample_flags |= PERF_SAMPLE_ADDR; } if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_INTR; } if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; } #ifdef CONFIG_CGROUP_PERF if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif /* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; u16 header_size = perf_sample_data_size(data, event); header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header_size > U16_MAX); data->dyn_size += size + sizeof(u64); /* size above */ data->sample_flags |= PERF_SAMPLE_AUX; } } void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. 
*/ WARN_ON_ONCE(header->size & 7); } static void __perf_event_aux_pause(struct perf_event *event, bool pause) { if (pause) { if (!event->hw.aux_paused) { event->hw.aux_paused = 1; event->pmu->stop(event, PERF_EF_PAUSE); } } else { if (event->hw.aux_paused) { event->hw.aux_paused = 0; event->pmu->start(event, PERF_EF_RESUME); } } } static void perf_event_aux_pause(struct perf_event *event, bool pause) { struct perf_buffer *rb; if (WARN_ON_ONCE(!event)) return; rb = ring_buffer_get(event); if (!rb) return; scoped_guard (irqsave) { /* * Guard against self-recursion here. Another event could trip * this same from NMI context. */ if (READ_ONCE(rb->aux_in_pause_resume)) break; WRITE_ONCE(rb->aux_in_pause_resume, 1); barrier(); __perf_event_aux_pause(event, pause); barrier(); WRITE_ONCE(rb->aux_in_pause_resume, 0); } ring_buffer_put(rb); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(data, event, regs); perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete 
enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; ctx = perf_pin_task_context(current); if (!ctx) return; perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. 
*/ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? 
PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; 
int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; u8 build_id[BUILD_ID_SIZE_MAX]; u32 build_id_size; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); use_build_id = event->attr.build_id && mmap_event->build_id_size; if (event->attr.mmap2 && use_build_id) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; perf_output_put(&handle, 
mmap_event->event_id); if (event->attr.mmap2) { if (use_build_id) { u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; __output_copy(&handle, size, 4); __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); } else { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); if (!name) name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma)) name = "[heap]"; else if (vma_is_initial_stack(vma)) name = "[stack]"; else name = "//anon"; } } cpy_name: strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. 
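 * A filter matches when it refers to the same inode as @file and the filter's
 * byte range [filter->offset, filter->offset + filter->size] overlaps the
 * mapped range [offset, offset + size].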
*/ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * Lost/dropped 
samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 
0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; 
perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); for (i = 1; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: return; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void 
perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 hw_id; } rec; int ret; if (event->parent) event = event->parent; rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.hw_id = hw_id; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; if (unlikely(throttle && hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) { /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events. 
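 * For example, an event opened with attr.exclude_kernel can still take its
 * PMI a few instructions after the CPU has entered the kernel; the recorded
 * regs then point at a kernel IP even though only user execution was counted.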
*/ if (event->attr.exclude_kernel && !user_mode(regs)) return false; return true; } #ifdef CONFIG_BPF_SYSCALL static int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not have a full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ return -EPROTO; } event->prog = prog; event->bpf_cookie = bpf_cookie; return 0; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; event->prog = NULL; bpf_prog_put(prog); } #else static inline int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return 1; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); if (event->attr.aux_pause) perf_event_aux_pause(event->aux_event, true); if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) goto out; /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); } if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not lose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address. */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; notify_mode = in_nmi() ?
TWA_NMI_CURRENT : TWA_RESUME; if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) event->pending_addr = data->addr; } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ WARN_ON_ONCE(event->pending_work != pending_id); } } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } out: if (event->attr.aux_resume) perf_event_aux_pause(event->aux_event, false); return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; old = local64_read(&hwc->period_left); do { val = old; if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. 
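 * (For example, if perf_swevent_set_period() reported three elapsed periods,
 * the first __perf_event_overflow() call in this loop runs with throttle == 0
 * and the remaining ones with throttle == 1; the loop stops early as soon as
 * one of those calls reports the event as throttled or over its limit.)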
*/ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. 
*/ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), 
GFP_KERNEL); if (!hlist) { err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static struct pmu perf_cpu_clock; /* fwd declaration */ static struct pmu perf_task_clock; static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: event->attr.type = perf_cpu_clock.type; return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: event->attr.type = perf_task_clock.type; return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; static int perf_tp_filter_match(struct perf_event *event, struct perf_raw_record *raw) { void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, raw)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { 
perf_swevent_put_recursion_context(rctx); return; } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; if (event->attr.config != entry->type) return; /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; if (perf_tp_event_match(event, raw, regs)) { perf_sample_data_init(data, 0, 0); perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); struct pmu *pmu = &perf_tracepoint; struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different sw events. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, event, &raw); perf_swevent_event(event, count, &data, regs); } } /* * If we were given a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; raw_spin_lock(&ctx->lock); perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset.
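 *
 * As an illustration (the offset below is hypothetical, not taken from this
 * file): a uprobe whose USDT reference counter sits at file offset 0x10a8
 * would be opened with
 *
 *   attr.config = 0x10a8ULL << PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
 *
 * optionally OR-ed with PERF_PROBE_CONFIG_IS_RETPROBE for the uretprobe
 * flavour.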
* * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } /* * returns true if the event is a tracepoint, or a kprobe/uprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if
(event->pmu == &perf_uprobe) return true; #endif return false; } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) return -EACCES; } return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. 
* Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. * * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". 
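 *
 * Hypothetical examples (addresses and the object path are invented for
 * illustration, they do not refer to anything in this file):
 *
 *   filter 0xffffffff81000000/0x4000        - trace only this kernel range
 *   start 0x400000/0x1000@/usr/bin/myapp    - start tracing in this object
 *   stop 0x401000@/usr/bin/myapp            - stop at a single file address
 *
 * Such a string is typically installed from userspace with
 * ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter 0x400000/0x1000@/usr/bin/myapp").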
*/ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. 
*/ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. 
*/ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. 
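 * For example (worked numbers, not from this file): attr.sample_freq = 1000
 * below becomes a fixed period of NSEC_PER_SEC / 1000 = 1,000,000 ns, i.e.
 * one hrtimer expiry per millisecond, after which attr.freq is cleared so
 * the generic frequency-adjustment path is never used for this event.
 */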
*/ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, flags); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->timestamp; u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = 
task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } static void free_pmu_context(struct pmu *pmu) { free_percpu(pmu->cpu_pmu_context); } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, noting to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) { switch (scope) { case PERF_PMU_SCOPE_CORE: return topology_sibling_cpumask(cpu); case PERF_PMU_SCOPE_DIE: return topology_die_cpumask(cpu); case PERF_PMU_SCOPE_CLUSTER: return topology_cluster_cpumask(cpu); case PERF_PMU_SCOPE_PKG: return topology_core_cpumask(cpu); case PERF_PMU_SCOPE_SYS_WIDE: return cpu_online_mask; } return NULL; } static inline struct cpumask *perf_scope_cpumask(unsigned int scope) { switch (scope) { case PERF_PMU_SCOPE_CORE: return perf_online_core_mask; case PERF_PMU_SCOPE_DIE: return perf_online_die_mask; case PERF_PMU_SCOPE_CLUSTER: return perf_online_cluster_mask; case PERF_PMU_SCOPE_PKG: return perf_online_pkg_mask; case PERF_PMU_SCOPE_SYS_WIDE: return 
perf_online_sys_mask; } return NULL; } static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pmu *pmu = dev_get_drvdata(dev); struct cpumask *mask = perf_scope_cpumask(pmu->scope); if (mask) return cpumap_print_to_pagebuf(true, buf, mask); return 0; } static DEVICE_ATTR_RO(cpumask); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, &dev_attr_cpumask.attr, NULL, }; static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); if (n == 2 && !pmu->nr_addr_filters) return 0; /* cpumask */ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) return 0; return a->mode; } static struct attribute_group pmu_dev_attr_group = { .is_visible = pmu_dev_is_visible, .attrs = pmu_dev_attrs, }; static const struct attribute_group *pmu_dev_groups[] = { &pmu_dev_attr_group, NULL, }; static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; } out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; pmu->type = -1; if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { ret = -EINVAL; goto free_pdc; } if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { ret = -EINVAL; goto free_pdc; } pmu->name = name; if (type >= 0) max = type; ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; WARN_ON(type >= 0 && ret != type); type = ret; pmu->type = type; if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. 
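			 */

			/*
			 * Concretely, for a PERF_PMU_TXN_ADD transaction the
			 * default stubs installed below bracket the whole
			 * group-add with a single perf_pmu_disable() /
			 * perf_pmu_enable() pair (commit and cancel both
			 * re-enable), while any other transaction type falls
			 * straight through as a no-op.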
*/ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; free_dev: if (pmu->dev && pmu->dev != PMU_NULL_DEV) { device_del(pmu->dev); put_device(pmu->dev); } free_idr: idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); free_percpu(pmu->pmu_disable_count); idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) ret = -EOPNOTSUPP; if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) ret = -EINVAL; if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); int cpu; if (pmu_cpumask && cpumask) { cpu = cpumask_any_and(pmu_cpumask, cpumask); if (cpu >= nr_cpu_ids) ret = -ENODEV; else event->event_caps |= PERF_EV_CAP_READ_SCOPE; } else { ret = -ENODEV; } } if (ret && event->destroy) event->destroy(event); } if (ret) module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu. 
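	 */

	/*
	 * For reference, and matching the decode further down: on
	 * heterogeneous systems a PERF_TYPE_HARDWARE or PERF_TYPE_HW_CACHE
	 * event can target a specific PMU by carrying that PMU's type id in
	 * the upper half of the config, e.g. (illustrative value)
	 *
	 *	attr.config = ((u64)pmu_type << PERF_PMU_TYPE_SHIFT) |
	 *		      PERF_COUNT_HW_CPU_CYCLES;
	 *
	 * while the low PERF_HW_EVENT_MASK bits keep the generic event id.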
*/ event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) { type = PERF_TYPE_RAW; } else { extended_type = true; event->attr.config &= PERF_HW_EVENT_MASK; } } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) goto fail; ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; if (ret != -ENOENT) { pmu = ERR_PTR(ret); goto unlock; } } fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.build_id) atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. 
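		 */

		/*
		 * Fast path first: if some other event already made
		 * perf_sched_count non-zero the static key is necessarily
		 * enabled, so a plain increment is enough; only the 0 -> 1
		 * transition needs the mutex and the synchronize_rcu() below.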
*/ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe they key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. */ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL); } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (parent_event) event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx. 
*/ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We do not support PERF_SAMPLE_READ on inherited events unless * PERF_SAMPLE_TID is also selected, which allows inherited events to * collect per-thread samples. * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } if (event->attr.aux_pause && event->attr.aux_resume) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_start_paused) { if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { err = -EOPNOTSUPP; goto err_pmu; } event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) goto err_pmu; } err = exclusive_event_init(event); if (err) goto err_pmu; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. 
*/ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } } err = security_perf_event_alloc(event); if (err) goto err_callchain_buffer; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; err_callchain_buffer: if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } err_addr_filters: kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); err_pmu: if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: if (event->hw.target) put_task_struct(event->hw.target); call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(attr); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. 
*/ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; if (!attr->inherit && attr->inherit_thread) return -EINVAL; if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL; out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. */ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. 
	 */
*/ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; bool is_capable = perfmon_capable(); if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL. */ rcu_read_lock(); is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); /* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task. */ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } /* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly. */ return is_capable || ptrace_may_access(task, ptrace_mode); } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? 
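	 */

	/*
	 * Roughly speaking, the kernel.perf_event_paranoid sysctl gates what
	 * follows: a value above 1 makes kernel profiling require CAP_PERFMON
	 * (see perf_allow_kernel()), above 0 additionally restricts CPU-wide
	 * events, and above -1 restricts raw tracepoint access; the LSM hook
	 * below gets the final say.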
*/ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(&attr); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(&attr); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { err = -EBADF; goto err_fd; } group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc; /* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred; } /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_cred; } mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
*/ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_locked; /* * Make sure we're both on the same context; either task or cpu. */ if (group_leader->ctx != ctx) goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_locked; if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context. */ pmu = group_leader->pmu_ctx->pmu; } else if (!is_software_event(event)) { if (is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } /* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) && group_leader->pmu_ctx->pmu != pmu) goto err_locked; } } /* * Now that we're certain of the pmu; find the pmu_ctx. */ pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_locked; } event->pmu_ctx = pmu_ctx; if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_context; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { perf_remove_from_context(group_leader, 0); put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); } /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { sibling->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); } /* * Removing from the context ends up with disabled * event. 
What we want here is event in the initial * startup state, ready to be add into new context. */ group_leader->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(&current->perf_event_mutex); list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); /* * File reference in group guarantees that group_leader has been * kept alive until we place the new event on the sibling_list. * This ensures destruction of the group leader will find * the pointer to itself in perf_group_detach(). */ fd_install(event_fd, event_file); return event_fd; err_context: put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: free_event(event); err_task: if (task) put_task_struct(task); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err; /* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; pmu = event->pmu; if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_unlock; } event->pmu_ctx = pmu_ctx; if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
*/ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_pmu_ctx: put_pmu_ctx(pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); static void __perf_pmu_remove(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct perf_event_groups *groups, struct list_head *events) { struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } } } static void __perf_pmu_install_event(struct pmu *pmu, struct perf_event_context *ctx, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; struct perf_event_context *old_ctx = event->ctx; get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); event->pmu_ctx = epc; if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); /* * Now that event->ctx is updated and visible, put the old ctx. */ put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct list_head *events) { struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } } void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); /* * Since per-cpu context is persistent, no need to grab an extra * reference. */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); if (!list_empty(&events)) { /* * Wait for the events to quiesce before re-instating them. 
*/ synchronize_rcu(); __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { struct task_struct *task = child_event->ctx->task; if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = 0; if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ detach_flags = DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } perf_remove_from_context(event, detach_flags); raw_spin_lock_irq(&ctx->lock); if (event->state > PERF_EVENT_STATE_EXIT) perf_event_set_state(event, PERF_EVENT_STATE_EXIT); raw_spin_unlock_irq(&ctx->lock); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); free_event(event); put_event(parent_event); return; } /* * Parent events are governed by their filedesc, retain them. */ perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); child_ctx = perf_pin_task_context(child); if (!child_ctx) return; /* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&child_ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ clone_ctx = unclone_ctx(child_ctx); raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. 
We can however still * get a few PERF_RECORD_READ events. */ perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); } /* * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task * with child's task_ctx, which generates EXIT events for * child contexts and sets child->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) return; mutex_lock(&parent->child_mutex); list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); put_event(parent); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; ctx = rcu_access_pointer(task->perf_event_ctxp); if (!ctx) return; mutex_lock(&ctx->mutex); raw_spin_lock_irq(&ctx->lock); /* * Destroy the task <-> ctx relation and mark the context dead. * * This is important because even though the task hasn't been * exposed yet the context has been (through child_list). */ RCU_INIT_POINTER(task->perf_event_ctxp, NULL); WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); /* * perf_event_release_kernel() could've stolen some of our * child events and still have them on its free_list. In that * case we must wait for these events to have been freed (in * particular all their references to this task must've been * dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. 
*/ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } int perf_allow_kernel(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } get_ctx(child_ctx); /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. 
*/ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } if (leader) leader->group_generation = parent_event->group_generation; return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, u64 clone_flags, int *inherited_all) { struct perf_event_context *child_ctx; int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || /* Do not inherit if sigtrap and signal handlers were cleared. */ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. 
*/ child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. 
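		 */

		/*
		 * In other words the clone chain is kept at depth one: the
		 * child ends up pointing at the eldest ancestor context, the
		 * same one its (possibly cloned) parent points at, never at
		 * an intermediate clone.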
*/ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); ret = perf_event_init_context(child, clone_flags); if (ret) { perf_event_free_task(child); return ret; } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_clear_cpumask(unsigned int cpu) { int target[PERF_PMU_MAX_SCOPE]; unsigned int scope; struct pmu *pmu; cpumask_clear_cpu(cpu, perf_online_mask); for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); target[scope] = -1; if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) continue; target[scope] = cpumask_any_but(cpumask, cpu); if (target[scope] < nr_cpu_ids) cpumask_set_cpu(target[scope], pmu_cpumask); } /* migrate */ list_for_each_entry(pmu, &pmus, entry) { if (pmu->scope == PERF_PMU_SCOPE_NONE || 
WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); } } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; // XXX simplify cpuctx->online mutex_lock(&pmus_lock); /* * Clear the cpumasks, and migrate to other CPUs if possible. * Must be invoked before the __perf_event_exit_context. */ perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif static void perf_event_setup_cpumask(unsigned int cpu) { struct cpumask *pmu_cpumask; unsigned int scope; /* * Early boot stage, the cpumask hasn't been set yet. * The perf_online_<domain>_masks includes the first CPU of each domain. * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. */ if (cpumask_empty(perf_online_mask)) { for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask)) continue; cpumask_set_cpu(cpu, pmu_cpumask); } goto end; } for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_empty(cpumask) && cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) cpumask_set_cpu(cpu, pmu_cpumask); } end: cpumask_set_cpu(cpu, perf_online_mask); } int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. 
*/ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; preempt_disable(); perf_cgroup_switch(task); preempt_enable(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
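/*
 * Usage sketch (userspace, not part of this file): a minimal counter built on
 * the software PMU registered in perf_event_init() above. Setting attr.inherit
 * exercises the fork-inheritance path implemented by perf_event_init_task()
 * and inherit_task_group(): children created by fork() get a cloned event and
 * their counts are folded back into the parent's value when they exit. The
 * workload placeholder and error handling are assumptions for illustration.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	/* There is no glibc wrapper; invoke the raw syscall. */
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;		/* PMU registered in perf_event_init() */
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled = 1;
	attr.inherit = 1;			/* count fork()ed children too */

	fd = sys_perf_event_open(&attr, 0 /* this task */, -1 /* any CPU */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload goes here; fork()ed children are included in the count ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock consumed: %lld ns\n", count);
	close(fd);
	return 0;
}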
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure.
*/ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * f_lock protects against read/modify/write race with * other SEEK_CURs. Note that parallel writes and reads * behave like SEEK_SET. */ guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. 
*/ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. 
*/ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. 
*/ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. 
*/ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { 
struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. 
*/ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = 
vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). 
*/ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. 
*/ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
*/ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. 
*/ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... 
*/ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
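/*
 * Usage sketch (userspace, not part of this file): a minimal file copy built
 * on copy_file_range(2), which ends up in vfs_copy_file_range() above. Short
 * copies are part of the contract, hence the loop; with NULL offset pointers
 * the kernel advances both file positions itself. Assumes the glibc wrapper
 * (glibc >= 2.27); paths and error handling are placeholders for illustration.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;
	off_t left;
	int in, out;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}

	in = open(argv[1], O_RDONLY);
	if (in < 0 || fstat(in, &st) < 0) {
		perror(argv[1]);
		return 1;
	}

	out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (out < 0) {
		perror(argv[2]);
		return 1;
	}

	for (left = st.st_size; left > 0; ) {
		/* flags must be 0 from userspace; see the syscall handler above */
		ssize_t n = copy_file_range(in, NULL, out, NULL, left, 0);

		if (n < 0) {
			perror("copy_file_range");
			return 1;
		}
		if (n == 0)	/* source shrank under us */
			break;
		left -= n;
	}

	close(in);
	close(out);
	return 0;
}

/*
 * Second sketch, same assumptions: walking the data/hole map of a sparse file
 * with SEEK_DATA/SEEK_HOLE as handled by must_set_pos() and default_llseek()
 * above. On filesystems without a smarter ->llseek the whole file reports as
 * one data extent followed by the virtual hole at EOF.
 */
void dump_extents(int fd)
{
	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = 0;

	while (pos < end) {
		off_t data = lseek(fd, pos, SEEK_DATA);
		off_t hole;

		if (data < 0)	/* ENXIO: nothing but a hole up to EOF */
			break;
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data [%lld, %lld)\n", (long long)data, (long long)hole);
		pos = hole;
	}
}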
5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 
// SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address).
* * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. 
*/ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. 
I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply 
only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. 
*/ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. 
*/ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. 
*/ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. 
*/ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) goto freeskb; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). 
*/ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. 
*/ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_lock(); err = arp_req_delete(net, &r); rtnl_unlock(); break; case SIOCSARP: rtnl_lock(); err = arp_req_set(net, &r); rtnl_unlock(); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
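arp_ioctl() above is reached through the classic SIOCGARP/SIOCSARP/SIOCDARP interface. A minimal userspace sketch of querying one entry with SIOCGARP (the device name and neighbour address below are only examples) could be:

#include <arpa/inet.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct arpreq req;
	struct sockaddr_in *sin = (struct sockaddr_in *)&req.arp_pa;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&req, 0, sizeof(req));
	sin->sin_family = AF_INET;			/* arp_ioctl() rejects anything else */
	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);	/* example neighbour */
	strncpy(req.arp_dev, "eth0", sizeof(req.arp_dev) - 1);	/* example device */

	if (ioctl(fd, SIOCGARP, &req) == 0)
		printf("flags 0x%x, first hw byte %02x\n", req.arp_flags,
		       (unsigned char)req.arp_ha.sa_data[0]);
	close(fd);
	return 0;
}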
/* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2.
The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). 
*/ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */
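Since kernel_cap_t is just a wrapped u64, the helpers above are plain bit algebra. An illustrative helper (not part of this header) built only from those definitions could test whether a capability set already covers the whole filesystem mask:

#include <linux/capability.h>

/* Illustrative only: true when @caps contains every capability in CAP_FS_SET,
 * i.e. cap_issubset(a, set) answers "are all bits of a present in set?". */
static inline bool caps_cover_fs(kernel_cap_t caps)
{
	kernel_cap_t fs = CAP_FS_SET;

	return cap_issubset(fs, caps);
}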
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_PGTABLE_PROT_H #define __ASM_PGTABLE_PROT_H #include <asm/memory.h> #include <asm/pgtable-hwdef.h> #include <linux/const.h> /* * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ #else #define PTE_UFFD_WP (_AT(pteval_t, 0)) #define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define _PAGE_KERNEL (PROT_NORMAL) #define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN) #define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) #define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT |
PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #ifndef __ASSEMBLY__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> #include <asm/rsi.h> extern bool arm64_use_ng_mappings; extern unsigned long prot_ns_shared; #define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) #ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false #define PTE_MAYBE_SHARED PTE_SHARED #define PMD_MAYBE_SHARED PMD_SECT_S #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #else static inline bool __pure lpa2_is_enabled(void) { return read_tcr() & TCR_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) #define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) #define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48) #endif /* * Highest possible physical address supported. */ #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. */ #define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) #define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ __val; \ }) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) #define PAGE_READONLY __pgprot(_PAGE_READONLY) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) #endif /* __ASSEMBLY__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ ((pte & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \ ((pte & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \ ((pte & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0))) /* * Page types used via Permission Indirection Extension (PIE). PIE uses * the USER, DBM, PXN and UXN bits to generate an index which is used * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define * combinations we use on non-PIE systems with the same encoding, for * convenience these are listed here as comments as are the unallocated * encodings.
*/ /* 0: PAGE_DEFAULT */ /* 1: PTE_USER */ /* 2: PTE_WRITE */ /* 3: PTE_WRITE | PTE_USER */ /* 4: PAGE_EXECONLY PTE_PXN */ /* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */ /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ /* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ /* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) #define PAGE_GCS __pgprot(_PAGE_GCS) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */
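A worked check of the encoding documented above, derived only from the comment table in this header: _PAGE_READONLY sets PTE_UXN, PTE_PXN and PTE_USER but not PTE_WRITE, so with the index bit order (3..0) = (UXN, PXN, WRITE, USER) it occupies slot 0b1101 = 0xd, matching the "d: PAGE_READONLY" entry, and PIE_E0 programs that slot with PIE_R_O. An illustrative compile-time assertion of the same fact might read:

/* Illustrative only: PAGE_READONLY should land in PIE slot 0xd. */
static_assert(pte_pi_index(_PAGE_READONLY) == 0xd);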
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/sysreg-sr.h> #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); /* * In VHE mode those registers are compatible between EL1 and EL2, * and the guest uses the _EL1 versions on the CPU naturally. * So we save them into their _EL2 versions here. * For nVHE mode we trap accesses to those registers, so our * _EL2 copy in sys_regs[] is always up-to-date and we don't need * to save anything here. */ if (vcpu_el2_e2h_is_set(vcpu)) { u64 val; /* * We don't save CPTR_EL2, as accesses to CPACR_EL1 * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); } /* * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where * the interesting CNTHCTL_EL2 bits live. So preserve these * bits when reading back the guest-visible value.
*/ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; } __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) { u64 val; /* These registers are common with EL1 */ write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* * In VHE mode those registers are compatible between * EL1 and EL2. */ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); } else { /* * CNTHCTL_EL2 only affects EL1 when running nVHE, so * no need to restore it. */ val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); write_sysreg_el1(val, SYS_SCTLR); val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); write_sysreg_el1(val, SYS_CPACR); val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); write_sysreg_el1(val, SYS_TTBR0); val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); write_sysreg_el1(val, SYS_TCR); } if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); } write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); } /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every * switch (sp_el0 is being dealt with in the assembly code). * tpidr_el0 and tpidrro_el0 only need to be switched when going * to host userspace or a different VCPU. EL1 registers only need to be * switched when potentially going to run a different VCPU. The latter two * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put. 
*/ void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_host_state_vhe); void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); __sysreg_save_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe); void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); __sysreg_restore_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * * @vcpu: The VCPU pointer * * Load system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_load() function * and loading system register state early avoids having to load them on * every entry to the VM. */ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; u64 mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* * When running a normal EL1 guest, we only load a new vcpu * after a context switch, which imvolves a DSB, so all * speculative EL1&0 walks will have already completed. * If running NV, the vcpu may transition between vEL1 and * vEL2 without a context switch, so make sure we complete * those walks before loading a new context. */ if (vcpu_has_nv(vcpu)) dsb(nsh); /* * Load guest EL1 and user state * * We must restore the 32-bit state before the sysregs, thanks * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { /* * Use the guest hypervisor's VPIDR_EL2 when in a * nested state. The hardware value of MIDR_EL1 gets * restored on put. */ write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } __sysreg_restore_el1_state(guest_ctxt, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } /** * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU * * @vcpu: The VCPU pointer * * Save guest system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_put() function * and deferring saving system register state until we're no longer running the * VCPU avoids having to save them on every exit from the VM. 
*/ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; host_ctxt = host_data_ptr(host_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); /* If leaving a nesting guest, restore MIDR_EL1 default view */ if (vcpu_has_nv(vcpu)) write_sysreg(read_cpuid_id(), vpidr_el2); vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); }
// SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU.
*/ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. 
*/ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. 
* * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. 
*/ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? 
*/ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. 
* * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. 
If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. 
*/ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). 
* * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. 
*/ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! 
*/ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. 
*/ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { tick_sched_flag_set(ts, TS_FLAG_HIGHRES); ts->sched_timer.function = tick_nohz_handler; } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time. */ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 *  GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>		/* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"

static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);

static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);

static char *sb_writers_name[SB_FREEZE_LEVELS] = {
	"sb_writers",
	"sb_pagefaults",
	"sb_internal",
};

static inline void __super_lock(struct super_block *sb, bool excl)
{
	if (excl)
		down_write(&sb->s_umount);
	else
		down_read(&sb->s_umount);
}

static inline void super_unlock(struct super_block *sb, bool excl)
{
	if (excl)
		up_write(&sb->s_umount);
	else
		up_read(&sb->s_umount);
}

static inline void __super_lock_excl(struct super_block *sb)
{
	__super_lock(sb, true);
}

static inline void super_unlock_excl(struct super_block *sb)
{
	super_unlock(sb, true);
}

static inline void super_unlock_shared(struct super_block *sb)
{
	super_unlock(sb, false);
}

static bool super_flags(const struct super_block *sb, unsigned int flags)
{
	/*
	 * Pairs with smp_store_release() in super_wake() and ensures
	 * that we see @flags after we're woken.
	 */
	return smp_load_acquire(&sb->s_flags) & flags;
}

/**
 * super_lock - wait for superblock to become ready and lock it
 * @sb: superblock to wait for
 * @excl: whether exclusive access is required
 *
 * If the superblock has passed through neither vfs_get_tree() nor
 * generic_shutdown_super() yet, wait for that to happen. Either superblock
 * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
 * woken and we'll see SB_DYING.
 *
 * The caller must have acquired a temporary reference on @sb->s_count.
 *
 * Return: The function returns true if SB_BORN was set and with
 *         s_umount held. The function returns false if SB_DYING was
 *         set and without s_umount held.
 */
static __must_check bool super_lock(struct super_block *sb, bool excl)
{
	lockdep_assert_not_held(&sb->s_umount);

	/* wait until the superblock is ready or dying */
	wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));

	/* Don't pointlessly acquire s_umount. */
	if (super_flags(sb, SB_DYING))
		return false;

	__super_lock(sb, excl);

	/*
	 * Has gone through generic_shutdown_super() in the meantime.
	 * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
	 * grab a reference to this. Tell them so.
*/ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. 
* * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. 
*/ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. 
*/ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. 
*/ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. 
* * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. 
*/ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ locked = super_lock(sb, excl); if (locked) { if (sb->s_root) return sb; super_unlock(sb, excl); } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. 
*/ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. 
*/ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } if (locked) super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); return; } if (locked) super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. 
*/ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. 
Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). 
*/ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. 
*/ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { error = setup_bdev_super(s, flags, NULL); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 
1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). 
*/ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. 
New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. 
If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to thaw * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystem may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating the workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so that we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
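/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * It shows how a kernel-internal caller might pair the freeze_super() and
 * thaw_super() holder semantics documented above. The wrapper name
 * kernel_quiesce_fs() is hypothetical; freeze_super(), thaw_super() and the
 * FREEZE_* flags are the interfaces defined in this file.
 */
static int __maybe_unused kernel_quiesce_fs(struct super_block *sb)
{
	int ret;

	/*
	 * FREEZE_MAY_NEST lets this hold nest with another freeze from the
	 * same holder class (e.g. a block-layer-initiated kernel freeze)
	 * instead of failing with -EBUSY.
	 */
	ret = freeze_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);
	if (ret)
		return ret;

	/* ... the filesystem is consistent and write-protected here ... */

	/* Drop only the kernel hold; any userspace freeze stays in effect. */
	return thaw_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);
}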
915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/busy_poll.h> #include <net/net_namespace.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include "dev.h" #include "devmem.h" #include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); for_each_netdev_dump(net, netdev, 
ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); return err; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, napi_defer_hard_irqs)) goto nla_put_failure; irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, irq_suspend_timeout)) goto nla_put_failure; gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } if (err) { goto err_free_msg; } else if (!rsp->len) { err = -ENOENT; goto err_free_msg; } return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; unsigned int prev_id; int err = 0; if (!netdev->up) return err; prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (napi->napi_id < MIN_NAPI_ID) continue; /* Dump continuation below depends on the list being sorted */ WARN_ON_ONCE(napi->napi_id >= prev_id); prev_id = napi->napi_id; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock(net, ifindex); if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); netdev_unlock(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } return err; } static int 
netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); } if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); } if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); } return 0; } int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; unsigned int napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } return err; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; binding = rxq->mp_params.mp_priv; if (binding && nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, txq->napi->napi_id)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err; if (!netdev->up) return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if 
(netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); netdev_unlock(netdev); } else { err = -ENODEV; } rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; if (!netdev->up) return err; for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; } for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = netdev_get_by_index_lock(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); netdev_unlock(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } rtnl_unlock(); return err; } #define NETDEV_STAT_NOT_SET (~0ULL) static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) { const u64 *add = _add; u64 *sum = _sum; while (size) { if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) *sum += *add; sum++; add++; size -= 8; } } static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) { if (value == NETDEV_STAT_NOT_SET) return 0; return nla_put_uint(rsp, attr_id, value); } static int netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || netdev_stat_put(rsp, 
NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, u32 q_type, int i, const struct genl_info *info) { const struct netdev_stat_ops *ops = netdev->stat_ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: memset(&rx, 0xff, sizeof(rx)); ops->get_queue_stats_rx(netdev, i, &rx); if (!memchr_inv(&rx, 0xff, sizeof(rx))) goto nla_cancel; if (netdev_nl_stats_write_rx(rsp, &rx)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: memset(&tx, 0xff, sizeof(tx)); ops->get_queue_stats_tx(netdev, i, &tx); if (!memchr_inv(&tx, 0xff, sizeof(tx))) goto nla_cancel; if (netdev_nl_stats_write_tx(rsp, &tx)) goto nla_put_failure; break; } genlmsg_end(rsp, hdr); return 0; nla_cancel: genlmsg_cancel(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { const struct netdev_stat_ops *ops = netdev->stat_ops; int i, err; if (!(netdev->flags & IFF_UP)) return 0; i = ctx->rxq_idx; while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info); if (err) return err; ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, i, info); if (err) return err; ctx->txq_idx = ++i; } ctx->rxq_idx = 0; ctx->txq_idx = 0; return 0; } static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { struct netdev_queue_stats_rx rx_sum, rx; struct netdev_queue_stats_tx tx_sum, tx; const struct netdev_stat_ops *ops; void *hdr; int i; ops = netdev->stat_ops; /* Netdev can't guarantee any complete counters */ if (!ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; for (i = 0; i < netdev->real_num_rx_queues; i++) { memset(&rx, 0xff, sizeof(rx)); if (ops->get_queue_stats_rx) ops->get_queue_stats_rx(netdev, i, &rx); netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); } for (i = 0; i < netdev->real_num_tx_queues; 
i++) { memset(&tx, 0xff, sizeof(tx)); if (ops->get_queue_stats_tx) ops->get_queue_stats_tx(netdev, i, &tx); netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); } if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, struct sk_buff *skb, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { if (!netdev->stat_ops) return 0; switch (scope) { case 0: return netdev_nl_stats_by_netdev(netdev, skb, info); case NETDEV_QSTATS_SCOPE_QUEUE: return netdev_nl_stats_by_queue(netdev, skb, info, ctx); } return -EINVAL; /* Should not happen, per netlink policy */ } int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; unsigned int ifindex; unsigned int scope; int err = 0; scope = 0; if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); ifindex = 0; if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev && netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); err = netdev ? -EOPNOTSUPP : -ENODEV; } } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); if (err < 0) break; } } rtnl_unlock(); return err; } int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct net_devmem_dmabuf_binding *binding; struct list_head *sock_binding_list; u32 ifindex, dmabuf_fd, rxq_idx; struct net_device *netdev; struct sk_buff *rsp; struct nlattr *attr; int rem, err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); sock_binding_list = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(sock_binding_list)) return PTR_ERR(sock_binding_list); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (!netdev || !netif_device_present(netdev)) { err = -ENODEV; goto err_unlock; } if (dev_xdp_prog_count(netdev)) { NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); err = -EEXIST; goto err_unlock; } binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; } nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { err = nla_parse_nested( tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, netdev_queue_id_nl_policy, info->extack); if (err < 0) goto err_unbind; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || NL_REQ_ATTR_CHECK(info->extack, 
attr, tb, NETDEV_A_QUEUE_TYPE)) { err = -EINVAL; goto err_unbind; } if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); err = -EINVAL; goto err_unbind; } rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) goto err_unbind; } list_add(&binding->list, sock_binding_list); nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); err = genlmsg_reply(rsp, info); if (err) goto err_unbind; rtnl_unlock(); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_unlock: rtnl_unlock(); err_genlmsg_free: nlmsg_free(rsp); return err; } void netdev_nl_sock_priv_init(struct list_head *priv) { INIT_LIST_HEAD(priv); } void netdev_nl_sock_priv_destroy(struct list_head *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; list_for_each_entry_safe(binding, temp, priv, list) { rtnl_lock(); net_devmem_unbind_dmabuf(binding); rtnl_unlock(); } } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); break; case NETDEV_UNREGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
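/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * The qstats code above pre-fills the stats structs with 0xff bytes and
 * treats ~0ULL (NETDEV_STAT_NOT_SET) as "counter not reported", so a driver
 * implementing netdev_stat_ops writes only the counters it actually tracks
 * and leaves the rest untouched. All "foo_*" names below are hypothetical.
 */
struct foo_rxq_counters {
	u64 packets;
	u64 bytes;
};

struct foo_priv {
	struct foo_rxq_counters rxq[8];
};

static void foo_get_queue_stats_rx(struct net_device *dev, int idx,
				   struct netdev_queue_stats_rx *rx)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Only packets/bytes are tracked; alloc_fail etc. stay "not set". */
	rx->packets = priv->rxq[idx].packets;
	rx->bytes = priv->rxq[idx].bytes;
}

static void foo_get_base_stats(struct net_device *dev,
			       struct netdev_queue_stats_rx *rx,
			       struct netdev_queue_stats_tx *tx)
{
	/* No device-wide remainder beyond the per-queue counters. */
	rx->packets = 0;
	rx->bytes = 0;
	tx->packets = 0;
	tx->bytes = 0;
}

static const struct netdev_stat_ops foo_stat_ops __maybe_unused = {
	.get_queue_stats_rx	= foo_get_queue_stats_rx,
	.get_base_stats		= foo_get_base_stats,
};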
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr,
unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? 
fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? 
size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. 
*/ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. 
* @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */
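/*
 * Editor's note: illustrative sketch only, not part of the original header.
 * It shows typical use of the helpers declared above; both functions below
 * are hypothetical and rely only on find_next_bit_wrap() and
 * for_each_set_bit(). (This header is always reached via <linux/bitmap.h>.)
 */
static __always_inline
unsigned long pick_next_id(const unsigned long *mask, unsigned long nbits,
			   unsigned long prev)
{
	/* Round-robin: continue after @prev, wrapping to the lowest set bit. */
	return find_next_bit_wrap(mask, nbits, prev + 1);
}

static __always_inline
unsigned long count_bits_by_iteration(const unsigned long *mask,
				      unsigned long nbits)
{
	unsigned long bit, n = 0;

	/* for_each_set_bit() visits each set bit in [0, nbits) in order. */
	for_each_set_bit(bit, mask, nbits)
		n++;
	return n;
}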
// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/fixmap.h> #include <asm/insn.h> #include <asm/kprobes.h> #include <asm/text-patching.h> #include <asm/sections.h> static DEFINE_RAW_SPINLOCK(patch_lock); static bool is_exit_text(unsigned long addr) { /* discarded with init text/data */ return system_state < SYSTEM_RUNNING && addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end; } static bool is_image_text(unsigned long addr) { return core_kernel_text(addr) || is_exit_text(addr); } static void __kprobes *patch_map(void *addr, int fixmap) { phys_addr_t phys; if (is_image_text((unsigned long)addr)) { phys = __pa_symbol(addr); } else { struct page *page = vmalloc_to_page(addr); BUG_ON(!page); phys = page_to_phys(page) + offset_in_page(addr); } return (void *)set_fixmap_offset(fixmap, phys); } static void __kprobes patch_unmap(int fixmap) { clear_fixmap(fixmap); } /* * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always * little-endian.
*/ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) { int ret; __le32 val; ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); if (!ret) *insnp = le32_to_cpu(val); return ret; } static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) { void *waddr = addr; unsigned long flags = 0; int ret; raw_spin_lock_irqsave(&patch_lock, flags); waddr = patch_map(addr, FIX_TEXT_POKE0); ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); return ret; } int __kprobes aarch64_insn_write(void *addr, u32 insn) { return __aarch64_insn_write(addr, cpu_to_le32(insn)); } noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) { u64 *waddr; unsigned long flags; int ret; raw_spin_lock_irqsave(&patch_lock, flags); waddr = patch_map(addr, FIX_TEXT_POKE0); ret = copy_to_kernel_nofault(waddr, &val, sizeof(val)); patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); return ret; } typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len); static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len) { unsigned long flags; size_t patched = 0; size_t size; void *waddr; void *ptr; raw_spin_lock_irqsave(&patch_lock, flags); while (patched < len) { ptr = addr + patched; size = min_t(size_t, PAGE_SIZE - offset_in_page(ptr), len - patched); waddr = patch_map(ptr, FIX_TEXT_POKE0); func(waddr, src, patched, size); patch_unmap(FIX_TEXT_POKE0); patched += size; } raw_spin_unlock_irqrestore(&patch_lock, flags); flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return addr; } static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len) { copy_to_kernel_nofault(dst, src + patched, len); } static void text_poke_memset(void *dst, void *src, size_t patched, size_t len) { u32 c = *(u32 *)src; memset32(dst, c, len / 4); } /** * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory * @dst: address to modify * @src: source of the copy * @len: length to copy * * Useful for JITs to dump new code blocks into unused regions of RX memory. */ noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len) { /* A64 instructions must be word aligned */ if ((uintptr_t)dst & 0x3) return NULL; return __text_poke(text_poke_memcpy, dst, src, len); } /** * aarch64_insn_set - memset for RX memory regions. * @dst: address to modify * @insn: value to set * @len: length of memory region. * * Useful for JITs to fill regions of RX memory with illegal instructions. */ noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len) { if ((uintptr_t)dst & 0x3) return NULL; return __text_poke(text_poke_memset, dst, &insn, len); } int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { u32 *tp = addr; int ret; /* A64 instructions must be word aligned */ if ((uintptr_t)tp & 0x3) return -EINVAL; ret = aarch64_insn_write(tp, insn); if (ret == 0) caches_clean_inval_pou((uintptr_t)tp, (uintptr_t)tp + AARCH64_INSN_SIZE); return ret; } struct aarch64_insn_patch { void **text_addrs; u32 *new_insns; int insn_cnt; atomic_t cpu_count; }; static int __kprobes aarch64_insn_patch_text_cb(void *arg) { int i, ret = 0; struct aarch64_insn_patch *pp = arg; /* The last CPU becomes master */ if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { for (i = 0; ret == 0 && i < pp->insn_cnt; i++) ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], pp->new_insns[i]); /* Notify other processors with an additional increment. 
*/ atomic_inc(&pp->cpu_count); } else { while (atomic_read(&pp->cpu_count) <= num_online_cpus()) cpu_relax(); isb(); } return ret; } int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) { struct aarch64_insn_patch patch = { .text_addrs = addrs, .new_insns = insns, .insn_cnt = cnt, .cpu_count = ATOMIC_INIT(0), }; if (cnt <= 0) return -EINVAL; return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, cpu_online_mask); }
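For context, a hedged sketch of how a caller elsewhere in arch/arm64 might drive the helpers defined above. The function name patch_nop_example() is hypothetical, and aarch64_insn_gen_nop() is assumed to be provided by <asm/insn.h>; this is an illustration, not part of patching.c.

/*
 * Hedged illustration (not part of patching.c): replace the instruction
 * at @addr with a NOP using the accessors defined above.
 */
static int patch_nop_example(void *addr)
{
        u32 old, nop = aarch64_insn_gen_nop();  /* assumed helper from <asm/insn.h> */
        int ret;

        /* read the current instruction through the fault-safe accessor */
        ret = aarch64_insn_read(addr, &old);
        if (ret)
                return ret;

        if (old == nop)
                return 0;

        /*
         * For code that other CPUs may already be executing, the
         * stop_machine()-based aarch64_insn_patch_text() keeps every CPU
         * quiescent while the instruction is rewritten and the caches are
         * cleaned; aarch64_insn_patch_text_nosync() would suffice for code
         * that is not yet live.
         */
        return aarch64_insn_patch_text(&addr, &nop, 1);
}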
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. 
*/ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 
15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case 
CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. 
*/ if (gwj->mod.modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (gwj->mod.csumfunc.crc8) (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); kmem_cache_free(cgw_cache, gwj); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, 
&gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (gwj->mod.uid) { if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) goto cancel; } if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) goto cancel; } if (gwj->mod.csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &gwj->mod.csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] 
= { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = 
mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; if (mod.uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) return -EINVAL; /* update modifications with disabled softirq & quit */ local_bh_disable(); memcpy(&gwj->mod, &mod, sizeof(mod)); local_bh_enable(); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) return -ENODEV; gwj = 
kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ memcpy(&gwj->mod, &mod, sizeof(mod)); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; /* we have a match when uid is enabled and identical */ if (gwj->mod.uid || mod.uid) { if (gwj->mod.uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE, .doit = cgw_create_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE, .doit = cgw_remove_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE, .dumpit = cgw_dump_jobs}, }; static __init int 
cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(&notifier); if (ret) goto out_register_notifier; ret = rtnl_register_many(cgw_rtnl_msg_handlers); if (ret) goto out_rtnl_register; return 0; out_rtnl_register: unregister_netdevice_notifier(&notifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(&notifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit);
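For context, a hedged userspace sketch of the rtnetlink interface that cgw_parse_attr() and cgw_create_job() above implement: a single RTM_NEWROUTE message carrying struct rtcanmsg with can_family = AF_CAN and gwtype = CGW_TYPE_CAN_CAN, plus the two mandatory CGW_SRC_IF/CGW_DST_IF attributes. The interface names can0/can1 are assumptions, error handling is trimmed, and CAP_NET_ADMIN is required; the cangw tool from can-utils wraps the same netlink messages.

/* Hedged userspace sketch (not kernel code): create one CAN->CAN routing job. */
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/can.h>
#include <linux/can/gw.h>

int main(void)
{
        struct {
                struct nlmsghdr nh;
                struct rtcanmsg rtcan;
                char buf[64];
        } req;
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        struct rtattr *rta;
        __u32 src = if_nametoindex("can0");     /* assumed interface names */
        __u32 dst = if_nametoindex("can1");
        int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(&req, 0, sizeof(req));
        req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtcanmsg));
        req.nh.nlmsg_type = RTM_NEWROUTE;
        req.nh.nlmsg_flags = NLM_F_REQUEST;
        req.rtcan.can_family = AF_CAN;
        req.rtcan.gwtype = CGW_TYPE_CAN_CAN;

        /* CGW_SRC_IF / CGW_DST_IF are the two mandatory u32 attributes */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
        rta->rta_type = CGW_SRC_IF;
        rta->rta_len = RTA_LENGTH(sizeof(src));
        memcpy(RTA_DATA(rta), &src, sizeof(src));
        req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
        rta->rta_type = CGW_DST_IF;
        rta->rta_len = RTA_LENGTH(sizeof(dst));
        memcpy(RTA_DATA(rta), &dst, sizeof(dst));
        req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

        sendto(s, &req, req.nh.nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
        close(s);
        return 0;
}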
// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int
secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) { unsigned long len = vma->vm_end - vma->vm_start; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap = secretmem_mmap, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 
0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init);
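For context, a hedged userspace sketch of the memfd_secret(2) flow implemented above. It assumes <sys/syscall.h> exposes __NR_memfd_secret and that secretmem is enabled on the running kernel (secretmem_enable set and can_set_direct_map() true); it is an illustration, not part of secretmem.c.

/* Hedged userspace sketch (not kernel code): map one secret page. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
        int fd = syscall(__NR_memfd_secret, 0);         /* assumed syscall number macro */
        if (fd < 0) {
                perror("memfd_secret");                 /* ENOSYS when secretmem is disabled */
                return 1;
        }

        /* the size may be set exactly once: secretmem_setattr() rejects
         * further ATTR_SIZE changes once i_size is non-zero */
        if (ftruncate(fd, 4096) < 0)
                return 1;

        /* the mapping must be MAP_SHARED and is implicitly locked, so it
         * counts against RLIMIT_MEMLOCK (see secretmem_mmap()) */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        strcpy(p, "not in the kernel direct map");
        munmap(p, 4096);
        close(fd);
        return 0;
}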
// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entries are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and it's problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case it's ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done.
Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow. */ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc/dec() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). * * The INT_MAX overflow condition is either used by the networking * code to reset or detected in the slow path of * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; guard(mutex)(&jump_label_mutex); /* Try to mark it as 'enabling in progress. */ if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { /* * While holding the mutex this should never observe * anything else than a value >= 1 and succeed */ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; } return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_dec_not_one(struct static_key *key) { int v; /* * Go into the slow path if key::enabled is less than or equal than * one. One is valid to shut down the key, anything less than one * is an imbalance, which is handled at the call site. * * That includes the special case of '-1' which is set in * static_key_slow_inc_cpuslocked(), but that's harmless as it is * fully serialized in the slow path below. 
By the time this task * acquires the jump label lock the value is back to one and the * retry under the lock must succeed. */ v = atomic_read(&key->enabled); do { /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); /* * Warn about underflow, and lie about success in an attempt to * not make things worse. */ if (WARN_ON_ONCE(v == 0)) return true; if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); int val; if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); val = atomic_read(&key->enabled); /* * It should be impossible to observe -1 with jump_label_mutex held, * see static_key_slow_inc_cpuslocked(). */ if (WARN_ON_ONCE(val == -1)) return; /* * Cannot already be 0, something went sideways. */ if (WARN_ON_ONCE(val == 0)) return; if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool 
static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. */ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; /* * Since we are initializing the static_key.enabled field with * with the 'raw' int values (to avoid pulling in atomic.h) in * jump_label.h, let's make sure that is safe. There are only two * cases to check since we initialize to 0 or 1. 
*/ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } static inline bool static_key_sealed(struct static_key *key) { return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); } static inline void static_key_seal(struct static_key *key) { unsigned long type = key->type & JUMP_TYPE_TRUE; key->type = JUMP_TYPE_LINKED | type; } void jump_label_init_ro(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; if (WARN_ON_ONCE(!static_key_initialized)) return; cpus_read_lock(); jump_label_lock(); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk = jump_entry_key(iter); if (!is_kernel_ro_after_init((unsigned long)iterk)) continue; if (static_key_sealed(iterk)) continue; static_key_seal(iterk); } jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). 
*/ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; preempt_enable(); if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } /* * If the key was sealed at init, then there's no need to keep a * reference to its module entries - just patch them now and be * done with it. */ if (static_key_sealed(key)) goto do_poke; jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } preempt_disable(); jlm2->mod = __module_address((unsigned long)key); preempt_enable(); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No @jlm allocated because key was sealed at init. 
*/ if (static_key_sealed(key)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one entry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex. 
* * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } preempt_disable(); mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */
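A minimal, illustrative sketch of how a caller typically consumes the static-key implementation above, via the static-branch wrappers. The key name and the my_feature_* functions are hypothetical; DEFINE_STATIC_KEY_FALSE(), static_branch_unlikely(), static_branch_enable() and static_branch_disable() are the documented jump-label interface that ends up in the slow-path routines shown above.

/* Illustrative sketch only -- not part of kernel/jump_label.c. */
#include <linux/jump_label.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_FALSE(my_feature_key);

void my_feature_hot_path(void)
{
	/* Compiled as a NOP while the key is disabled, patched to a jump once enabled. */
	if (static_branch_unlikely(&my_feature_key))
		pr_info("my_feature: slow path taken\n");
}

void my_feature_set(bool on)
{
	/*
	 * These resolve to static_key_enable()/static_key_disable() above;
	 * they take jump_label_mutex and patch kernel text, so they may
	 * sleep and must not be called from atomic context.
	 */
	if (on)
		static_branch_enable(&my_feature_key);
	else
		static_branch_disable(&my_feature_key);
}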
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #ifndef RXE_VERBS_H #define RXE_VERBS_H #include <linux/interrupt.h> #include <linux/workqueue.h> #include "rxe_pool.h" #include "rxe_task.h" #include "rxe_hw_counters.h" static inline int pkey_match(u16 key1, u16 key2) { return (((key1 & 0x7fff) != 0) && ((key1 & 0x7fff) == (key2 & 0x7fff)) && ((key1 & 0x8000) || (key2 & 0x8000))) ? 
1 : 0; } /* Return >0 if psn_a > psn_b * 0 if psn_a == psn_b * <0 if psn_a < psn_b */ static inline int psn_compare(u32 psn_a, u32 psn_b) { s32 diff; diff = (psn_a - psn_b) << 8; return diff; } struct rxe_ucontext { struct ib_ucontext ibuc; struct rxe_pool_elem elem; }; struct rxe_pd { struct ib_pd ibpd; struct rxe_pool_elem elem; }; struct rxe_ah { struct ib_ah ibah; struct rxe_pool_elem elem; struct rxe_av av; bool is_user; int ah_num; }; struct rxe_cqe { union { struct ib_wc ibwc; struct ib_uverbs_wc uibwc; }; }; struct rxe_cq { struct ib_cq ibcq; struct rxe_pool_elem elem; struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; bool is_user; atomic_t num_wq; }; enum wqe_state { wqe_state_posted, wqe_state_processing, wqe_state_pending, wqe_state_done, wqe_state_error, }; struct rxe_sq { int max_wr; int max_sge; int max_inline; spinlock_t sq_lock; /* guard queue */ struct rxe_queue *queue; }; struct rxe_rq { int max_wr; int max_sge; spinlock_t producer_lock; /* guard queue producer */ spinlock_t consumer_lock; /* guard queue consumer */ struct rxe_queue *queue; }; struct rxe_srq { struct ib_srq ibsrq; struct rxe_pool_elem elem; struct rxe_pd *pd; struct rxe_rq rq; u32 srq_num; int limit; int error; }; struct rxe_req_info { int wqe_index; u32 psn; int opcode; atomic_t rd_atomic; int wait_fence; int need_rd_atomic; int wait_psn; int need_retry; int wait_for_rnr_timer; int noack_pkts; int again; }; struct rxe_comp_info { u32 psn; int opcode; int timeout; int timeout_retry; int started_retry; u32 retry_cnt; u32 rnr_retry; }; enum rdatm_res_state { rdatm_res_state_next, rdatm_res_state_new, rdatm_res_state_replay, }; struct resp_res { int type; int replay; u32 first_psn; u32 last_psn; u32 cur_psn; enum rdatm_res_state state; union { struct { u64 orig_val; } atomic; struct { u64 va_org; u32 rkey; u32 length; u64 va; u32 resid; } read; struct { u32 length; u64 va; u8 type; u8 level; } flush; }; }; struct rxe_resp_info { u32 msn; u32 psn; u32 ack_psn; int opcode; int drop_msg; int goto_error; int sent_psn_nak; enum ib_wc_status status; u8 aeth_syndrome; /* Receive only */ struct rxe_recv_wqe *wqe; /* RDMA read / atomic only */ u64 va; u64 offset; struct rxe_mr *mr; u32 resid; u32 rkey; u32 length; /* SRQ only */ struct { struct rxe_recv_wqe wqe; struct ib_sge sge[RXE_MAX_SGE]; } srq_wqe; /* Responder resources. It's a circular list where the oldest * resource is dropped first. */ struct resp_res *resources; unsigned int res_head; unsigned int res_tail; struct resp_res *res; }; struct rxe_qp { struct ib_qp ibqp; struct rxe_pool_elem elem; struct ib_qp_attr attr; unsigned int valid; unsigned int mtu; bool is_user; struct rxe_pd *pd; struct rxe_srq *srq; struct rxe_cq *scq; struct rxe_cq *rcq; enum ib_sig_type sq_sig_type; struct rxe_sq sq; struct rxe_rq rq; struct socket *sk; u32 dst_cookie; u16 src_port; struct rxe_av pri_av; struct rxe_av alt_av; atomic_t mcg_num; struct sk_buff_head req_pkts; struct sk_buff_head resp_pkts; struct rxe_task send_task; struct rxe_task recv_task; struct rxe_req_info req; struct rxe_comp_info comp; struct rxe_resp_info resp; atomic_t ssn; atomic_t skb_out; int need_req_skb; /* Timer for retranmitting packet when ACKs have been lost. RC * only. The requester sets it when it is not already * started. The responder resets it whenever an ack is * received. */ struct timer_list retrans_timer; u64 qp_timeout_jiffies; /* Timer for handling RNR NAKS. 
*/ struct timer_list rnr_nak_timer; spinlock_t state_lock; /* guard requester and completer */ struct execute_work cleanup_work; }; enum { RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC, RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE | IB_ACCESS_LOCAL_WRITE | IB_ACCESS_MW_BIND | IB_ACCESS_ON_DEMAND | IB_ACCESS_FLUSH_GLOBAL | IB_ACCESS_FLUSH_PERSISTENT | IB_ACCESS_OPTIONAL, RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR | IB_ZERO_BASED, }; enum rxe_mr_state { RXE_MR_STATE_INVALID, RXE_MR_STATE_FREE, RXE_MR_STATE_VALID, }; enum rxe_mr_copy_dir { RXE_TO_MR_OBJ, RXE_FROM_MR_OBJ, }; enum rxe_mr_lookup_type { RXE_LOOKUP_LOCAL, RXE_LOOKUP_REMOTE, }; enum rxe_rereg { RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD | IB_MR_REREG_ACCESS, }; static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); } struct rxe_mr { struct rxe_pool_elem elem; struct ib_mr ibmr; struct ib_umem *umem; u32 lkey; u32 rkey; enum rxe_mr_state state; int access; atomic_t num_mw; unsigned int page_offset; unsigned int page_shift; u64 page_mask; u32 num_buf; u32 nbuf; struct xarray page_list; }; static inline unsigned int mr_page_size(struct rxe_mr *mr) { return mr ? mr->ibmr.page_size : PAGE_SIZE; } enum rxe_mw_state { RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID, RXE_MW_STATE_FREE = RXE_MR_STATE_FREE, RXE_MW_STATE_VALID = RXE_MR_STATE_VALID, }; struct rxe_mw { struct ib_mw ibmw; struct rxe_pool_elem elem; spinlock_t lock; enum rxe_mw_state state; struct rxe_qp *qp; /* Type 2 only */ struct rxe_mr *mr; u32 rkey; int access; u64 addr; u64 length; }; struct rxe_mcg { struct rb_node node; struct kref ref_cnt; struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; atomic_t qp_num; u32 qkey; u16 pkey; }; struct rxe_mca { struct list_head qp_list; struct rxe_qp *qp; }; struct rxe_port { struct ib_port_attr attr; __be64 port_guid; __be64 subnet_prefix; spinlock_t port_lock; /* guard port */ unsigned int mtu_cap; /* special QPs */ u32 qp_gsi_index; }; #define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; int max_ucontext; int max_inline_data; struct mutex usdev_lock; struct rxe_pool uc_pool; struct rxe_pool pd_pool; struct rxe_pool ah_pool; struct rxe_pool srq_pool; struct rxe_pool qp_pool; struct rxe_pool cq_pool; struct rxe_pool mr_pool; struct rxe_pool mw_pool; /* multicast support */ spinlock_t mcg_lock; struct rb_root mcg_tree; atomic_t mcg_num; atomic_t mcg_attach; spinlock_t pending_lock; /* guard pending_mmaps */ struct list_head pending_mmaps; spinlock_t mmap_offset_lock; /* guard mmap_offset */ u64 mmap_offset; atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; struct rxe_port port; struct crypto_shash *tfm; }; static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) { return ib_device_get_netdev(dev, RXE_PORT); } static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); } static inline struct rxe_dev *to_rdev(struct ib_device *dev) { return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; } static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) { return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; } static inline struct rxe_pd *to_rpd(struct ib_pd *pd) { return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; } static inline struct rxe_ah *to_rah(struct ib_ah *ah) { return ah ? 
container_of(ah, struct rxe_ah, ibah) : NULL; } static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) { return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; } static inline struct rxe_qp *to_rqp(struct ib_qp *qp) { return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; } static inline struct rxe_cq *to_rcq(struct ib_cq *cq) { return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; } static inline struct rxe_mr *to_rmr(struct ib_mr *mr) { return mr ? container_of(mr, struct rxe_mr, ibmr) : NULL; } static inline struct rxe_mw *to_rmw(struct ib_mw *mw) { return mw ? container_of(mw, struct rxe_mw, ibmw) : NULL; } static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah) { return to_rpd(ah->ibah.pd); } static inline struct rxe_pd *mr_pd(struct rxe_mr *mr) { return to_rpd(mr->ibmr.pd); } static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) { return to_rpd(mw->ibmw.pd); } int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, struct net_device *ndev); #endif /* RXE_VERBS_H */
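As a quick check on psn_compare() above: PSNs are 24-bit sequence numbers, so shifting the 32-bit difference left by 8 moves the wraparound point into the sign bit. The sketch below is illustrative only (hypothetical function name, values chosen to show the wrap case) and assumes the definitions from this header.

/* Illustrative sketch only -- not part of rxe_verbs.h. */
static void psn_compare_example(void)
{
	/* Plain ordering: PSN 5 is newer than PSN 3. */
	WARN_ON(psn_compare(5, 3) <= 0);

	/* 24-bit wraparound: 0x000002 is "after" 0xfffffe. */
	WARN_ON(psn_compare(0x000002, 0xfffffe) <= 0);

	/* Equal PSNs compare as zero. */
	WARN_ON(psn_compare(7, 7) != 0);
}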
/* SPDX-License-Identifier: GPL-2.0 */ /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #ifndef __ASM_GENERIC_RWONCE_H #define __ASM_GENERIC_RWONCE_H #ifndef __ASSEMBLY__ #include <linux/compiler_types.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we * rely on the access being split into 2x32-bit accesses for a 32-bit quantity * (e.g. a virtual address) and a strong prevailing wind. */ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ "Unsupported access size for {READ,WRITE}_ONCE().") /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #define WRITE_ONCE(x, val) \ do { \ compiletime_assert_rwonce_type(x); \ __WRITE_ONCE(x, val); \ } while (0) static __no_sanitize_or_inline unsigned long __read_once_word_nocheck(const void *addr) { return __READ_ONCE(*(unsigned long *)addr); } /* * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a * word from memory atomically but without telling KASAN/KCSAN. This is * usually used by unwinding code when walking the stack of a running process. */ #define READ_ONCE_NOCHECK(x) \ ({ \ compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_kasan_or_inline unsigned long read_word_at_a_time(const void *addr) { kasan_check_read(addr, 1); return *(unsigned long *)addr; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_RWONCE_H */
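A short sketch of the usage pattern the header comment above describes: READ_ONCE()/WRITE_ONCE() mediating a flag shared between an interrupt handler and process-level code on the same CPU. The variable and function names are invented for illustration; only the macros come from rwonce.h.

/* Illustrative sketch only -- not part of asm-generic/rwonce.h. */
#include <asm/rwonce.h>
#include <linux/processor.h>	/* cpu_relax() */

static int data_ready;	/* shared between IRQ and task context */

/* IRQ side: publish the flag with a single store the compiler cannot tear or elide. */
static void example_mark_ready(void)
{
	WRITE_ONCE(data_ready, 1);
}

/* Task side: the volatile load keeps the compiler from hoisting the read out of the loop. */
static void example_wait_ready(void)
{
	while (!READ_ONCE(data_ready))
		cpu_relax();
}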
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_MTE_H #define __ASM_MTE_H #include <asm/compiler.h> #include <asm/mte-def.h> #ifndef __ASSEMBLY__ #include <linux/bitfield.h> #include <linux/kasan-enabled.h> #include <linux/page-flags.h> #include <linux/sched.h> #include <linux/types.h> #include <asm/pgtable-types.h> void mte_clear_page_tags(void *addr); unsigned long mte_copy_tags_from_user(void *to, const void __user *from, unsigned long n); unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); void *mte_allocate_tag_storage(void); void mte_free_tag_storage(char *storage); #ifdef CONFIG_ARM64_MTE /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 /* simple lock to avoid multiple threads tagging the same page */ #define PG_mte_lock PG_arch_3 static inline void set_page_mte_tagged(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * Ensure that the tags written prior to this function are visible * before the page flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &page->flags); } static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } /* * Lock the page for tagging and return 'true' if the page can be tagged, * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the * locking only happens once for page initialisation. * * The page MTE lock state: * * Locked: PG_mte_lock && !PG_mte_tagged * Unlocked: !PG_mte_lock || PG_mte_tagged * * Acquire semantics only if the page is tagged (returning 'false'). */ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. 
*/ smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); return false; } void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t pte, unsigned int nr_pages); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); void mte_cpu_setup(void); void mte_suspend_enter(void); void mte_suspend_exit(void); long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 static inline void set_page_mte_tagged(struct page *page) { } static inline bool page_mte_tagged(struct page *page) { return false; } static inline bool try_page_mte_tagging(struct page *page) { return false; } static inline void mte_zero_clear_page_tags(void *addr) { } static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) { } static inline void mte_thread_init_user(void) { } static inline void mte_thread_switch(struct task_struct *next) { } static inline void mte_suspend_enter(void) { } static inline void mte_suspend_exit(void) { } static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg) { return 0; } static inline long get_mte_ctrl(struct task_struct *task) { return 0; } static inline int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data) { return -EIO; } #endif /* CONFIG_ARM64_MTE */ #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * Ensure that the tags written prior to this function are visible * before the folio flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &folio->flags); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { bool ret = test_bit(PG_mte_tagged, &folio->flags); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * If the folio is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); if (!test_and_set_bit(PG_mte_lock, &folio->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); return false; } #else static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { return false; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { return false; } #endif static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) return; /* * Re-enable tag checking (TCO set on exception entry). This is only * necessary if MTE is enabled in either the kernel or the userspace * task in synchronous or asymmetric mode (SCTLR_EL1.TCF0 bit 0 is set * for both). 
With MTE disabled in the kernel and disabled or * asynchronous in userspace, tag check faults (including in uaccesses) * are not reported, therefore there is no need to re-enable checking. * This is beneficial on microarchitectures where re-enabling TCO is * expensive. */ if (kasan_hw_tags_enabled() || (task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))) asm volatile(SET_PSTATE_TCO(0)); } #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { if (!kasan_hw_tags_enabled()) return; mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { if (!kasan_hw_tags_enabled()) return; /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() * is required. */ dsb(nsh); isb(); mte_check_tfsr_el1(); } #else static inline void mte_check_tfsr_el1(void) { } static inline void mte_check_tfsr_entry(void) { } static inline void mte_check_tfsr_exit(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_MTE_H */
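The PG_mte_lock/PG_mte_tagged comments above describe a lock, initialise, then publish sequence; the sketch below shows that pattern in one place. The wrapper name example_mte_init_page_tags() is hypothetical; try_page_mte_tagging(), mte_clear_page_tags(), set_page_mte_tagged() and page_address() are the existing helpers declared above or in <linux/mm.h>.

/* Illustrative sketch only -- not part of asm/mte.h. */
#include <linux/mm.h>	/* page_address() */
#include <asm/mte.h>

static void example_mte_init_page_tags(struct page *page)
{
	/*
	 * Only the caller that wins PG_mte_lock sees 'true' and writes the
	 * tags; later callers either observe PG_mte_tagged already set or
	 * wait for it inside try_page_mte_tagging().
	 */
	if (try_page_mte_tagging(page)) {
		mte_clear_page_tags(page_address(page));
		set_page_mte_tagged(page);	/* smp_wmb() before setting the flag */
	}
}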
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_HOST_H #define __KVM_HOST_H #include <linux/types.h> #include <linux/hardirq.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/bug.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/mmu_notifier.h> #include <linux/preempt.h> #include <linux/msi.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/rcupdate.h> #include <linux/ratelimit.h> #include <linux/err.h> #include <linux/irqflags.h> #include <linux/context_tracking.h> #include <linux/irqbypass.h> #include <linux/rcuwait.h> #include <linux/refcount.h> #include <linux/nospec.h> #include <linux/notifier.h> #include <linux/ftrace.h> #include <linux/hashtable.h> #include <linux/instrumentation.h> #include <linux/interval_tree.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> #include <linux/kvm_types.h> #include <asm/kvm_host.h> #include <linux/kvm_dirty_ring.h> #ifndef KVM_MAX_VCPU_IDS #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in * include/linux/kvm.h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) /* * Bit 63 of the memslot generation number is an "update in-progress flag", * e.g. is temporarily set for the duration of kvm_swap_active_memslots(). * This flag effectively creates a unique generation number that is used to * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, * i.e. may (or may not) have come from the previous memslots generation. * * This is necessary because the actual memslots update is not atomic with * respect to the generation number update. Updating the generation number * first would allow a vCPU to cache a spte from the old memslots using the * new generation number, and updating the generation number after switching * to the new memslots would allow cache hits using the old generation number * to reference the defunct memslots. 
* * This mechanism is used to prevent getting hits in KVM's caches while a * memslot update is in-progress, and to prevent cache hits *after* updating * the actual generation number against accesses that were inserted into the * cache *before* the memslots were updated. */ #define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 #ifndef KVM_MAX_NR_ADDRESS_SPACES #define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* * For the normal pfn, the highest 12 bits should be zero, * so we can mask bit 62 ~ bit 52 to indicate the error pfn, * mask bit 63 to indicate the noslot pfn. */ #define KVM_PFN_ERR_MASK (0x7ffULL << 52) #define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) #define KVM_PFN_NOSLOT (0x1ULL << 63) #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) #define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to * translate it to pfn on host. */ static inline bool is_error_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } /* * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted * by a pending signal. Note, the signal may or may not be fatal. */ static inline bool is_sigpending_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_ERR_SIGPENDING; } /* * error_noslot pfns indicate that the gfn can not be * translated to pfn - it is not in slot or failed to * translate it to pfn. */ static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ static inline bool is_noslot_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_NOSLOT; } /* * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390) * provide own defines and kvm_is_error_hva */ #ifndef KVM_HVA_ERR_BAD #define KVM_HVA_ERR_BAD (PAGE_OFFSET) #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) static inline bool kvm_is_error_hva(unsigned long addr) { return addr >= PAGE_OFFSET; } #endif static inline bool kvm_is_error_gpa(gpa_t gpa) { return gpa == INVALID_GPA; } #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) #define KVM_REQUEST_WAIT BIT(9) #define KVM_REQUEST_NO_ACTION BIT(10) /* * Architecture-independent vcpu->requests bit members * Bits 3-7 are reserved for more arch-independent bits. */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_DIRTY_RING_SOFT_FULL 3 #define KVM_REQUEST_ARCH_BASE 8 /* * KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to * OUTSIDE_GUEST_MODE. KVM_REQ_OUTSIDE_GUEST_MODE differs from a vCPU "kick" * in that it ensures the vCPU has reached OUTSIDE_GUEST_MODE before continuing * on. A kick only guarantees that the vCPU is on its way out, e.g. a previous * kick may have set vcpu->mode to EXITING_GUEST_MODE, and so there's no * guarantee the vCPU received an IPI and has actually exited guest mode. 
*/ #define KVM_REQ_OUTSIDE_GUEST_MODE (KVM_REQUEST_NO_ACTION | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ BUILD_BUG_ON((unsigned)(nr) >= (sizeof_field(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \ (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \ }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 extern struct mutex kvm_lock; extern struct list_head vm_list; struct kvm_io_range { gpa_t addr; int len; struct kvm_io_device *dev; }; #define NR_IOBUS_DEVS 1000 struct kvm_io_bus { int dev_count; int ioeventfd_count; struct kvm_io_range range[]; }; enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, KVM_IOCSR_BUS, KVM_NR_BUSES }; int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie); int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { struct work_struct work; struct list_head link; struct list_head queue; struct kvm_vcpu *vcpu; gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; bool wakeup_all; bool notpresent_injected; }; void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; enum kvm_gfn_range_filter { KVM_FILTER_SHARED = BIT(0), KVM_FILTER_PRIVATE = BIT(1), }; struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); #endif enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE, READING_SHADOW_PAGE_TABLES, }; struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). */ struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; bool writable; }; /* * Used to check if the mapping is valid or not. Never use 'kvm_host_map' * directly to check for that. 
*/ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) { return !!map->hva; } static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) { return single_task_running() && !need_resched() && ktime_before(cur, stop); } /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. */ struct kvm_mmio_fragment { gpa_t gpa; void *data; unsigned len; }; struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; #endif int cpu; int vcpu_id; /* id given by userspace at creation */ int vcpu_idx; /* index into kvm->vcpu_array */ int ____srcu_idx; /* Don't use this directly. You've been warned. */ #ifdef CONFIG_PROVE_RCU int srcu_depth; #endif int mode; u64 requests; unsigned long guest_debug; struct mutex mutex; struct kvm_run *run; #ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; #endif struct pid *pid; rwlock_t pid_lock; int sigset_active; sigset_t sigset; unsigned int halt_poll_ns; bool valid_wakeup; #ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; #endif #ifdef CONFIG_KVM_ASYNC_PF struct { u32 queued; struct list_head queue; struct list_head done; spinlock_t lock; } async_pf; #endif #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. * dy_eligible: indicates whether vcpu is eligible for directed yield. */ struct { bool in_spin_loop; bool dy_eligible; } spin_loop; #endif bool wants_to_run; bool preempted; bool ready; bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; /* * The most recently used memslot by this vCPU and the slots generation * for which it is valid. * No wraparound protection is needed since generations won't overflow in * thousands of years, even assuming 1M memslot operations per second. */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; }; /* * Start accounting time towards a guest. * Must be called before entering guest context. */ static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the * stime pending cputime to flush. */ instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); } /* * Enter guest context and enter an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_enter_irqoff(void) { /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ if (!context_tracking_guest_enter()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Deprecated. Architectures should move to guest_timing_enter_irqoff() and * guest_state_enter_irqoff(). 
*/ static __always_inline void guest_enter_irqoff(void) { guest_timing_enter_irqoff(); guest_context_enter_irqoff(); } /** * guest_state_enter_irqoff - Fixup state when entering a guest * * Entry to a guest will enable interrupts, but the kernel state is interrupts * disabled when this is invoked. Also tell RCU about it. * * 1) Trace interrupts on state * 2) Invoke context tracking if enabled to adjust RCU state * 3) Tell lockdep that interrupts are enabled * * Invoked from architecture specific code before entering a guest. * Must be called with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_enter_irqoff() before this. * * Note: this is analogous to exit_to_user_mode(). */ static __always_inline void guest_state_enter_irqoff(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); guest_context_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); } /* * Exit guest context and exit an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_exit_irqoff(void) { /* * Guest mode is treated as a quiescent state, see * guest_context_enter_irqoff() for more details. */ if (!context_tracking_guest_exit()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Stop accounting time towards a guest. * Must be called after exiting guest context. */ static __always_inline void guest_timing_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } /* * Deprecated. Architectures should move to guest_state_exit_irqoff() and * guest_timing_exit_irqoff(). */ static __always_inline void guest_exit_irqoff(void) { guest_context_exit_irqoff(); guest_timing_exit_irqoff(); } static inline void guest_exit(void) { unsigned long flags; local_irq_save(flags); guest_exit_irqoff(); local_irq_restore(flags); } /** * guest_state_exit_irqoff - Establish state when returning from guest mode * * Entry from a guest disables interrupts, but guest mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. * * 1) Tell lockdep that interrupts are disabled * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state * * Invoked from architecture specific code after exiting a guest. * Must be invoked with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_exit_irqoff() after this. * * Note: this is analogous to enter_from_user_mode(). */ static __always_inline void guest_state_exit_irqoff(void) { lockdep_hardirqs_off(CALLER_ADDR0); guest_context_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); } static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* * The memory barrier ensures a previous write to vcpu->requests cannot * be reordered with the read of vcpu->mode. It pairs with the general * memory barrier following the write of vcpu->mode in VCPU RUN. */ smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } /* * Some of the bitops functions do not support too long bitmaps. 
* This number must be determined not to exceed such limits. */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) /* * Since at idle each memslot belongs to two memslot sets it has to contain * two embedded nodes for each data structure that it forms a part of. * * Two memslot sets (one active and one inactive) are necessary so the VM * continues to run on one memslot set while the other is being modified. * * These two memslot sets normally point to the same set of memslots. * They can, however, be desynchronized when performing a memslot management * operation by replacing the memslot to be modified by its copy. * After the operation is complete, both memslot sets once again point to * the same, common set of memslot data. * * The memslots themselves are independent of each other so they can be * individually added or deleted. */ struct kvm_memory_slot { struct hlist_node id_node[2]; struct interval_tree_node hva_node[2]; struct rb_node gfn_node[2]; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; u32 flags; short id; u16 as_id; #ifdef CONFIG_KVM_PRIVATE_MEM struct { /* * Writes protected by kvm->slots_lock. Acquiring a * reference via kvm_gmem_get_file() is protected by * either kvm->slots_lock or kvm->srcu. */ struct file *file; pgoff_t pgoff; } gmem; #endif }; static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; } static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { return ALIGN(memslot->npages, BITS_PER_LONG) / 8; } static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long len = kvm_dirty_bitmap_bytes(memslot); return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); } #ifndef KVM_DIRTY_LOG_MANUAL_CAPS #define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE #endif struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; u64 ind_offset; u32 summary_offset; u32 adapter_id; }; struct kvm_hv_sint { u32 vcpu; u32 sint; }; struct kvm_xen_evtchn { u32 port; u32 vcpu_id; int vcpu_idx; u32 priority; }; struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); union { struct { unsigned irqchip; unsigned pin; } irqchip; struct { u32 address_lo; u32 address_hi; u32 data; u32 flags; u32 devid; } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING struct kvm_irq_routing_table { int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; u32 nr_rt_entries; /* * Array indexed by gsi. Each entry contains list of irq chips * the gsi is connected to. 
*/ struct hlist_head map[] __counted_by(nr_rt_entries); }; #endif bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #ifndef KVM_INTERNAL_MEM_SLOTS #define KVM_INTERNAL_MEM_SLOTS 0 #endif #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) #if KVM_MAX_NR_ADDRESS_SPACES == 1 static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) { return KVM_MAX_NR_ADDRESS_SPACES; } static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; } #endif /* * Arch code must define kvm_arch_has_private_mem if support for private memory * is enabled. */ #if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; } #endif #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); } #endif struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; struct rb_root_cached hva_tree; struct rb_root gfn_tree; /* * The mapping table from slot id to memslot. * * 7-bit bucket count matches the size of the old id to index array for * 512 slots, while giving good performance with this slot count. * Higher bucket counts bring only small performance improvements but * always result in higher memory usage (even for lower memslot counts). */ DECLARE_HASHTABLE(id_hash, 7); int node_idx; }; struct kvm { #ifdef KVM_HAVE_MMU_RWLOCK rwlock_t mmu_lock; #else spinlock_t mmu_lock; #endif /* KVM_HAVE_MMU_RWLOCK */ struct mutex slots_lock; /* * Protects the arch-specific fields of struct kvm_memory_slots in * use by the VM. To be used under the slots_lock (above) or in a * kvm->srcu critical section where acquiring the slots_lock would * lead to deadlock with the synchronize_srcu in * kvm_swap_active_memslots(). */ struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an * incorrect answer is acceptable. */ atomic_t nr_memslots_dirty_logging; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; unsigned long mn_active_invalidate_count; struct rcuwait mn_memslots_update_rcuwait; /* For management / invalidation of gfn_to_pfn_caches */ spinlock_t gpc_lock; struct list_head gpc_list; /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only * incremented after storing the kvm_vcpu pointer in vcpus, * and is accessed atomically. */ atomic_t online_vcpus; int max_vcpus; int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_IRQCHIP struct { spinlock_t lock; struct list_head items; /* resampler_list update side is protected by resampler_lock. 
*/ struct list_head resampler_list; struct mutex resampler_lock; } irqfds; #endif struct list_head ioeventfds; struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; #ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; #endif struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP /* * Update side is protected by irq_lock. */ struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head irq_ack_notifier_list; #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; bool override_halt_poll_ns; unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES /* Protected by slots_lock (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; }; #define kvm_err(fmt, ...) \ pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_info(fmt, ...) \ pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug(fmt, ...) \ pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug_ratelimited(fmt, ...) \ pr_debug_ratelimited("kvm [%i]: " fmt, task_pid_nr(current), \ ## __VA_ARGS__) #define kvm_pr_unimpl(fmt, ...) \ pr_err_ratelimited("kvm [%i]: " fmt, \ task_tgid_nr(current), ## __VA_ARGS__) /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) \ kvm_pr_unimpl("vcpu%i, guest rIP: 0x%lx " fmt, \ (vcpu)->vcpu_id, kvm_rip_read(vcpu), ## __VA_ARGS__) #define vcpu_debug(vcpu, fmt, ...) \ kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) #define vcpu_debug_ratelimited(vcpu, fmt, ...) \ kvm_debug_ratelimited("vcpu%i " fmt, (vcpu)->vcpu_id, \ ## __VA_ARGS__) #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) static inline void kvm_vm_dead(struct kvm *kvm) { kvm->vm_dead = true; kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); } static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; kvm_vm_dead(kvm); } #define KVM_BUG(cond, kvm, fmt...) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) #define KVM_BUG_ON(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) /* * Note, "data corruption" refers to corruption of host kernel data structures, * not guest data. Guest data corruption, suspected or confirmed, that is tied * and contained to a single VM should *never* BUG() and potentially panic the * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure * is corrupted and that corruption can have a cascading effect to other parts * of the host and/or to other VMs.
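 *
 * Rule of thumb (illustrative, not prescriptive): a failed sanity check that
 * is contained to a single VM wants KVM_BUG_ON(cond, kvm), which only marks
 * that VM as bugged; corruption of a host-wide structure that could cascade
 * into other VMs is what this variant (which honors
 * CONFIG_BUG_ON_DATA_CORRUPTION) exists for.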
*/ #define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) \ BUG_ON(__ret); \ else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PROVE_RCU WARN_ONCE(vcpu->srcu_depth++, "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1); #endif vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu) { srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx); #ifdef CONFIG_PROVE_RCU WARN_ONCE(--vcpu->srcu_depth, "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth); #endif } static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { int num_vcpus = atomic_read(&kvm->online_vcpus); /* * Explicitly verify the target vCPU is online, as the anti-speculation * logic only limits the CPU's ability to speculate, e.g. given a "bad" * index, clamping the index to 0 would return vCPU0, not NULL. */ if (i >= num_vcpus) return NULL; i = array_index_nospec(i, num_vcpus); /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return xa_load(&kvm->vcpu_array, i); } #define kvm_for_each_vcpu(idx, vcpup, kvm) \ if (atomic_read(&kvm->online_vcpus)) \ xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (id < 0) return NULL; if (id < KVM_MAX_VCPUS) vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (vcpu->vcpu_id == id) return vcpu; return NULL; } void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) { } #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else static inline int kvm_irqfd_init(void) { return 0; } static inline void kvm_irqfd_exit(void) { } #endif int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) { return __kvm_memslots(kvm, 0); } static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) { int as_id = 
kvm_arch_vcpu_memslots_id(vcpu); return __kvm_memslots(vcpu->kvm, as_id); } static inline bool kvm_memslots_empty(struct kvm_memslots *slots) { return RB_EMPTY_ROOT(&slots->gfn_tree); } bool kvm_are_all_memslots_empty(struct kvm *kvm); #define kvm_for_each_memslot(memslot, bkt, slots) \ hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ if (WARN_ON_ONCE(!memslot->npages)) { \ } else static inline struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { struct kvm_memory_slot *slot; int idx = slots->node_idx; hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) { if (slot->id == id) return slot; } return NULL; } /* Iterator used for walking memslots that overlap a gfn range. */ struct kvm_memslot_iter { struct kvm_memslots *slots; struct rb_node *node; struct kvm_memory_slot *slot; }; static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter) { iter->node = rb_next(iter->node); if (!iter->node) return; iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[iter->slots->node_idx]); } static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter, struct kvm_memslots *slots, gfn_t start) { int idx = slots->node_idx; struct rb_node *tmp; struct kvm_memory_slot *slot; iter->slots = slots; /* * Find the so called "upper bound" of a key - the first node that has * its key strictly greater than the searched one (the start gfn in our case). */ iter->node = NULL; for (tmp = slots->gfn_tree.rb_node; tmp; ) { slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]); if (start < slot->base_gfn) { iter->node = tmp; tmp = tmp->rb_left; } else { tmp = tmp->rb_right; } } /* * Find the slot with the lowest gfn that can possibly intersect with * the range, so we'll ideally have slot start <= range start */ if (iter->node) { /* * A NULL previous node means that the very first slot * already has a higher start gfn. * In this case slot start > range start. */ tmp = rb_prev(iter->node); if (tmp) iter->node = tmp; } else { /* a NULL node below means no slots */ iter->node = rb_last(&slots->gfn_tree); } if (iter->node) { iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]); /* * It is possible in the slot start < range start case that the * found slot ends before or at range start (slot end <= range start) * and so it does not overlap the requested range. * * In such non-overlapping case the next slot (if it exists) will * already have slot start > range start, otherwise the logic above * would have found it instead of the current slot. 
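 *
 * Worked example (illustrative gfns): with slots covering [0x000, 0x100) and
 * [0x300, 0x400), starting the iteration at 0x200 first lands on the
 * [0x000, 0x100) slot; that slot ends at or before the range start, so the
 * check below advances to [0x300, 0x400), the first slot that can overlap.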
*/ if (iter->slot->base_gfn + iter->slot->npages <= start) kvm_memslot_iter_next(iter); } } static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end) { if (!iter->node) return false; /* * If this slot starts beyond or at the end of the range so does * every next one */ return iter->slot->base_gfn < end; } /* Iterate over each memslot at least partially intersecting [start, end) range */ #define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end) \ for (kvm_memslot_iter_start(iter, slots, start); \ kvm_memslot_iter_is_valid(iter, end); \ kvm_memslot_iter_next(iter)) struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot * - delete an existing memory slot * - modify an existing memory slot * -- move it in the guest physical memory space * -- just change its flags * * Since flags can be changed by some of these operations, the following * differentiation is the best we can do for kvm_set_memory_region(): */ enum kvm_mr_change { KVM_MR_CREATE, KVM_MR_DELETE, KVM_MR_MOVE, KVM_MR_FLAGS_ONLY, }; int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); /* flush all memory translations */ void kvm_arch_flush_shadow_all(struct kvm *kvm); /* flush memory translations pointing to 'slot' */ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { return __gfn_to_page(kvm, gfn, true); } unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); static inline void kvm_release_page_unused(struct page *page) { if (!page) return; put_page(page); } void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, bool unused, bool dirty) { lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); if (!page) return; /* * If the page that KVM got from the *primary MMU* is writable, and KVM * installed or reused a SPTE, mark the page/folio dirty. Note, this * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if * the GFN is write-protected. Folios can't be safely marked dirty * outside of mmu_lock as doing so could race with writeback on the * folio. As a result, KVM can't mark folios dirty in the fast page * fault handler, and so KVM must (somewhat) speculatively mark the * folio dirty if KVM could locklessly make the SPTE writable. 
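 *
 * Sketch of the expected pairing (architecture details vary; this is not a
 * complete fault handler):
 *
 *	pfn = kvm_faultin_pfn(vcpu, gfn, write, &writable, &page);
 *	...
 *	with mmu_lock held: install or reuse the SPTE, then
 *	kvm_release_faultin_page(kvm, page, false, writable);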
*/ if (unused) kvm_release_page_unused(page); else if (dirty) kvm_release_page_dirty(page); else kvm_release_page_clean(page); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool write, bool *writable, struct page **refcounted_page) { return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); #define __kvm_get_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = get_user(v, __uaddr); \ __ret; \ }) #define kvm_get_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) #define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, true); } static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, false); } unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len); int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void 
*data, unsigned long len); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** * kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). */ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); /** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest * physical address. * * @gpc: struct gfn_to_pfn_cache object. * @gpa: guest physical address to map. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for * invalidations to be processed. Callers are required to use kvm_gpc_check() * to ensure that the cache is valid before accessing the target page. */ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA. * * @gpc: struct gfn_to_pfn_cache object. * @hva: userspace virtual address to map. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * The semantics of this function are the same as those of kvm_gpc_activate(). It * merely bypasses a layer of address translation. */ int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len); /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being accessed must fit a single page. * * @return: %true if the cache is still valid and the address matches. * %false if the cache is not valid. * * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock * while calling this function, and then continue to hold the lock until the * access is complete. * * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->srcu for the memslot checks. */ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful * return from this function does not mean the page can be immediately * accessed because it may have raced with an invalidation. Callers must * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
* * @gpc: struct gfn_to_pfn_cache object. * * This removes a cache from the VM's list to be processed on MMU notifier * invocation. */ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && !kvm_is_error_gpa(gpc->gpa); } static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && kvm_is_error_gpa(gpc->gpa); } void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); void kvm_vcpu_halt(struct kvm_vcpu *vcpu); bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot); #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min); int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc); void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot); #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot); #endif int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr); int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 
struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state); #endif #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry); #else static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of * kvm_usage_count, i.e. at the beginning of the generic hardware enabling * sequence, and at the end of the generic hardware disabling sequence. */ void kvm_arch_enable_virtualization(void); void kvm_arch_disable_virtualization(void); /* * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to * do the actual twiddling of hardware bits. The hooks are called on all * online CPUs when KVM enables/disables virtualization, and on a single CPU * when that CPU is onlined/offlined (including for Resume/Suspend). */ int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* * All architectures that want to use vzalloc currently also * need their own kvm_arch_alloc_vm implementation.
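 *
 * A hypothetical override for an architecture whose struct kvm is too big to
 * kzalloc() might look like (sketch only):
 *
 *	struct kvm *kvm_arch_alloc_vm(void)
 *	{
 *		return vzalloc(sizeof(struct kvm));
 *	}
 *
 * The default __kvm_arch_free_vm() below uses kvfree(), which copes with both
 * kmalloc()ed and vmalloc()ed memory.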
*/ static inline struct kvm *kvm_arch_alloc_vm(void) { return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT); } #endif static inline void __kvm_arch_free_vm(struct kvm *kvm) { kvfree(kvm); } #ifndef __KVM_HAVE_ARCH_VM_FREE static inline void kvm_arch_free_vm(struct kvm *kvm) { __kvm_arch_free_vm(kvm); } #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { return -ENOTSUPP; } #else int kvm_arch_flush_remote_tlbs(struct kvm *kvm); #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { return -EOPNOTSUPP; } #else int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); #endif #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA void kvm_arch_register_noncoherent_dma(struct kvm *kvm); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); #else static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { } static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { } static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return false; } #endif #ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE void kvm_arch_start_assignment(struct kvm *kvm); void kvm_arch_end_assignment(struct kvm *kvm); bool kvm_arch_has_assigned_device(struct kvm *kvm); #else static inline void kvm_arch_start_assignment(struct kvm *kvm) { } static inline void kvm_arch_end_assignment(struct kvm *kvm) { } static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) { return false; } #endif static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.waitp; #else return &vcpu->wait; #endif } /* * Wake a vCPU if necessary, but don't do any stats/metadata updates. Returns * true if the vCPU was blocking and was awakened, false otherwise. */ static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { return !!rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); } static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu) { return rcuwait_active(kvm_arch_vcpu_get_wait(vcpu)); } #ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED /* * returns true if the virtual interrupt controller is initialized and * ready to accept virtual IRQ. On some architectures the virtual interrupt * controller is dynamically instantiated and this is not always true. 
*/ bool kvm_arch_intc_initialized(struct kvm *kvm); #else static inline bool kvm_arch_intc_initialized(struct kvm *kvm) { return true; } #endif #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); void kvm_unregister_perf_callbacks(void); #else static inline void kvm_register_perf_callbacks(void *ign) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi); int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* * Returns a pointer to the memslot if it contains gfn. * Otherwise returns NULL. */ static inline struct kvm_memory_slot * try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { if (!slot) return NULL; if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) return slot; else return NULL; } /* * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL. * * With "approx" set returns the memslot also when the address falls * in a hole. In that case one of the memslots bordering the hole is * returned. */ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; struct rb_node *node; int idx = slots->node_idx; slot = NULL; for (node = slots->gfn_tree.rb_node; node; ) { slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]); if (gfn >= slot->base_gfn) { if (gfn < slot->base_gfn + slot->npages) return slot; node = node->rb_right; } else node = node->rb_left; } return approx ? slot : NULL; } static inline struct kvm_memory_slot * ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot); slot = try_get_memslot(slot, gfn); if (slot) return slot; slot = search_memslots(slots, gfn, approx); if (slot) { atomic_long_set(&slots->last_used_slot, (unsigned long)slot); return slot; } return NULL; } /* * __gfn_to_memslot() and its descendants are here to allow arch code to inline * the lookups in hot paths. 
gfn_to_memslot() itself isn't here as an inline * because that would bloat other code too much. */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { return ____gfn_to_memslot(slots, gfn, false); } static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { /* * The index was checked originally in search_memslots. To avoid * that a malicious guest builds a Spectre gadget out of e.g. page * table walks, do not let the processor speculate loads outside * the guest's registered memslots. */ unsigned long offset = gfn - slot->base_gfn; offset = array_index_nospec(offset, slot->npages); return slot->userspace_addr + offset * PAGE_SIZE; } static inline int memslot_id(struct kvm *kvm, gfn_t gfn) { return gfn_to_memslot(kvm, gfn)->id; } static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; return slot->base_gfn + gfn_offset; } static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; } static inline gfn_t gpa_to_gfn(gpa_t gpa) { return (gfn_t)(gpa >> PAGE_SHIFT); } static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; } static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); return !kvm_is_error_hva(hva); } static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) { lockdep_assert_held(&gpc->lock); if (!gpc->memslot) return; mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); } enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, }; struct kvm_stat_data { struct kvm *kvm; const struct _kvm_stats_desc *desc; enum kvm_stat_kind kind; }; struct _kvm_stats_desc { struct kvm_stats_desc desc; char name[KVM_STATS_NAME_SIZE]; }; #define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ .size = sz, \ .bucket_size = bsz #define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } #define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } #define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } #define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ 
unit, base, exponent, 1, 0) #define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ unit, base, exponent, sz, bsz) #define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous counter, read only */ #define STATS_DESC_ICOUNTER(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Peak counter, read/write */ #define STATS_DESC_PCOUNTER(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous boolean value, read only */ #define STATS_DESC_IBOOLEAN(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Peak (sticky) boolean value, read/write */ #define STATS_DESC_PBOOLEAN(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) /* Linear histogram for time in nanosecond */ #define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz, bsz) /* Logarithmic histogram for time in nanosecond */ #define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); /** * kvm_stats_linear_hist_update() - Update bucket value for linear histogram * statistics data. * * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the linear histogram's bucket * @bucket_size: the size (width) of a bucket */ static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, u64 value, size_t bucket_size) { size_t index = div64_u64(value, bucket_size); index = min(index, size - 1); ++data[index]; } /** * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram * statistics data. 
* * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the logarithmic histogram's bucket */ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) { size_t index = fls64(value); index = min(index, size - 1); ++data[index]; } #define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) #define KVM_STATS_LOG_HIST_UPDATE(array, value) \ kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) return 1; /* * Ensure the read of mmu_invalidate_in_progress happens before * the read of mmu_invalidate_seq. This interacts with the * smp_wmb() in mmu_notifier_invalidate_range_end to make sure * that the caller either sees the old (non-zero) value of * mmu_invalidate_in_progress or the new (incremented) value of * mmu_invalidate_seq. * * PowerPC Book3s HV KVM calls this under a per-page lock rather * than under kvm->mmu_lock, for scalability, so can't rely on * kvm->mmu_lock to keep things ordered. */ smp_rmb(); if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { lockdep_assert_held(&kvm->mmu_lock); /* * If mmu_invalidate_in_progress is non-zero, then the range maintained * by kvm_mmu_notifier_invalidate_range_start contains all addresses * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handling concurrent invalidations. */ if (unlikely(kvm->mmu_invalidate_in_progress)) { /* * Dropping mmu_lock after bumping mmu_invalidate_in_progress * but before updating the range is a KVM bug. */ if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA || kvm->mmu_invalidate_range_end == INVALID_GPA)) return 1; if (gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return 1; } if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } /* * This lockless version of the range-based retry check *must* be paired with a * call to the locked version after acquiring mmu_lock, i.e. this is safe to * use only as a pre-check to avoid contending mmu_lock. This version *will* * get false negatives and false positives. */ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { /* * Use READ_ONCE() to ensure the in-progress flag and sequence counter * are always read from memory, e.g. so that checking for retry in a * loop won't result in an infinite retry loop. Don't force loads for * start+end, as the key to avoiding infinite retry loops is observing * the 1=>0 transition of in-progress, i.e. getting false negatives * due to stale start+end values is acceptable.
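 *
 * For context, a simplified sketch of how the retry helpers are typically
 * paired in a fault path (arch specifics omitted):
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	... fault the pfn in without holding mmu_lock ...
 *	... acquire mmu_lock ...
 *	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
 *		goto retry;
 *	... install the mapping ...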
*/ if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) && gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return true; return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */ bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); int kvm_init_irq_routing(struct kvm *kvm); int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); #else static inline void kvm_free_irq_routing(struct kvm *kvm) {} static inline int kvm_init_irq_routing(struct kvm *kvm) { return 0; } #endif int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin); void kvm_irq_routing_update(struct kvm *); #else static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin) { return false; } #endif /* CONFIG_HAVE_KVM_IRQCHIP */ void kvm_arch_irq_routing_update(struct kvm *kvm); static inline void __kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Ensure the rest of the request is published to kvm_check_request's * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Requests that don't require vCPU action should never be logged in * vcpu->requests. The vCPU won't clear the request, so it will stay * logged indefinitely and prevent the vCPU from entering the guest. */ BUILD_BUG_ON(!__builtin_constant_p(req) || (req & KVM_REQUEST_NO_ACTION)); __kvm_make_request(req, vcpu); } static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (kvm_test_request(req, vcpu)) { kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's * caller. Paired with the smp_wmb in kvm_make_request.
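 *
 * (For context, the typical producer/consumer pairing is roughly the
 * following, where KVM_REQ_FOO is a hypothetical request:
 *
 *	requester:	kvm_make_request(KVM_REQ_FOO, vcpu);
 *			kvm_vcpu_kick(vcpu);
 *	vCPU loop:	if (kvm_check_request(KVM_REQ_FOO, vcpu))
 *				handle it before (re)entering the guest;
 *
 * the smp_wmb()/smp_mb__after_atomic() pair orders the data associated with
 * the request against these bit operations.)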
*/ smp_mb__after_atomic(); return true; } else { return false; } } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { const struct kvm_device_ops *ops; struct kvm *kvm; void *private; struct list_head vm_node; }; /* create, destroy, and name are mandatory */ struct kvm_device_ops { const char *name; /* * create is called holding kvm->lock and any operations not suitable * to do while holding the lock should be deferred to init (see * below). */ int (*create)(struct kvm_device *dev, u32 type); /* * init is called after create if create is successful and is called * outside of holding kvm->lock. */ void (*init)(struct kvm_device *dev); /* * Destroy is responsible for freeing dev. * * Destroy may be called before or after destructors are called * on emulated I/O regions, depending on whether a reference is * held by a vcpu or other kvm component that gets destroyed * after the emulated I/O. */ void (*destroy)(struct kvm_device *dev); /* * Release is an alternative method to free the device. It is * called when the device file descriptor is closed. Once * release is called, the destroy method will not be called * anymore as the device is removed from the device list of * the VM. kvm->lock is held. */ void (*release)(struct kvm_device *dev); int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, unsigned long arg); int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); }; struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; } #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) { return (memslot && memslot->id < KVM_USER_MEM_SLOTS && !(memslot->flags & KVM_MEMSLOT_INVALID)); } struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct 
kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS /* If we wake up during the poll time, was it a successful poll? */ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return vcpu->valid_wakeup; } #else static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return true; } #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ #ifdef CONFIG_HAVE_KVM_NO_POLL /* Callback that tells if we must not poll */ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu); #else static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return false; } #endif /* CONFIG_HAVE_KVM_NO_POLL */ #ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); #else static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -ENOIOCTLCMD; } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { return 0; } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->stat.signal_exits++; } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ /* * If more than one page is being (un)accounted, @virt must be the address of * the first page of a block of pages that were allocated together (i.e. * accounted together). * * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() * is thread-safe. */ static inline void kvm_account_pgtable_pages(void *virt, int nr) { mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); } /* * This defines how many reserved entries we want to keep before we * kick the vcpu out to userspace to avoid the dirty ring getting full. This * value can be tuned higher if e.g. PML is enabled on the host. */ #define KVM_DIRTY_RING_RSVD_ENTRIES 64 /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t size, bool is_write, bool is_exec, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; /* RWX flags are not (yet) defined or communicated to userspace.
*/ vcpu->run->memory_fault.flags = 0; if (is_private) vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); } bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return false; } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; } #endif /* CONFIG_KVM_PRIVATE_MEM */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif #ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * * @kvm: KVM instance * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration * if not NULL) * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range * @opaque: opaque data to pass to @post_populate callback * * This is primarily intended for cases where a gmem-backed GPA range needs * to be initialized with userspace-provided data prior to being mapped into * the guest as a private page. This should be called with the slots->lock * held so that caller-enforced invariants regarding the expected memory * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. * * Returns the number of pages that were populated. */ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); #endif #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif #endif
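The request helpers above pair smp_wmb() in __kvm_make_request() with smp_mb__after_atomic() in kvm_check_request(): whatever the requester stored before setting the bit must be visible to the vCPU that clears the bit. Below is a minimal, self-contained userspace sketch of that publish/consume ordering using C11 atomics; struct fake_vcpu, REQ_EXAMPLE and the payload field are invented for illustration and are not KVM APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REQ_EXAMPLE 0

struct fake_vcpu {
        int payload;            /* data the requester publishes before the bit */
        atomic_ulong requests;  /* stands in for vcpu->requests */
};

static void make_request(struct fake_vcpu *v, int data)
{
        v->payload = data;
        /* Publish the payload before setting the bit (smp_wmb() in KVM). */
        atomic_thread_fence(memory_order_release);
        atomic_fetch_or(&v->requests, 1UL << REQ_EXAMPLE);
}

static bool check_request(struct fake_vcpu *v, int *data)
{
        if (!(atomic_load(&v->requests) & (1UL << REQ_EXAMPLE)))
                return false;
        atomic_fetch_and(&v->requests, ~(1UL << REQ_EXAMPLE));
        /* Pairs with the release fence above (smp_mb__after_atomic() in KVM). */
        atomic_thread_fence(memory_order_acquire);
        *data = v->payload;
        return true;
}

int main(void)
{
        struct fake_vcpu v = { .requests = 0 };
        int data;

        make_request(&v, 42);
        if (check_request(&v, &data))
                printf("request seen, payload=%d\n", data);
        return 0;
}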
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio.
*/ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. 
*/ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || 
!lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name 
*anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment is constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTL's release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment.
* * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * a none pte also means we are not demoting the pte, so a tlb flush is not * needed. E.g., when the pte is cleared, the caller should have taken care of * the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif
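folio_lru_list() above resolves a folio to an LRU index in a fixed order: unevictable wins outright, then folio_is_file_lru() selects the inactive file or inactive anon list, and the active flag bumps the result by LRU_ACTIVE. The snippet below is a tiny standalone model of that decision order; the enum values and struct fake_folio are invented stand-ins for the real folio flags and lru_list indices, not kernel definitions.

#include <stdbool.h>
#include <stdio.h>

enum lru { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE };

struct fake_folio {
        bool unevictable;
        bool swapbacked;  /* !swapbacked means "file" in folio_is_file_lru() terms */
        bool active;
};

static enum lru pick_lru(const struct fake_folio *f)
{
        enum lru lru;

        if (f->unevictable)
                return UNEVICTABLE;
        lru = f->swapbacked ? INACTIVE_ANON : INACTIVE_FILE;
        if (f->active)
                lru += 1;  /* mirrors "lru += LRU_ACTIVE" in folio_lru_list() */
        return lru;
}

int main(void)
{
        struct fake_folio pagecache = { .swapbacked = false, .active = true };
        struct fake_folio anon = { .swapbacked = true, .active = false };

        printf("page cache folio -> %d (ACTIVE_FILE=%d)\n", pick_lru(&pagecache), ACTIVE_FILE);
        printf("anon folio       -> %d (INACTIVE_ANON=%d)\n", pick_lru(&anon), INACTIVE_ANON);
        return 0;
}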
// SPDX-License-Identifier: GPL-2.0-only /* * Fault injection for both 32 and 64bit guests. * * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Based on arch/arm/kvm/emulate.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <hyp/adjust_pc.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__) #error Hypervisor code only! #endif static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val; if (unlikely(vcpu_has_nv(vcpu))) return vcpu_read_sys_reg(vcpu, reg); else if (__vcpu_read_sys_reg_from_cpu(reg, &val)) return val; return __vcpu_sys_reg(vcpu, reg); } static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { if (unlikely(vcpu_has_nv(vcpu))) vcpu_write_sys_reg(vcpu, val, reg); else if (!__vcpu_write_sys_reg_to_cpu(val, reg)) __vcpu_sys_reg(vcpu, reg) = val; } static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, u64 val) { if (unlikely(vcpu_has_nv(vcpu))) { if (target_mode == PSR_MODE_EL1h) vcpu_write_sys_reg(vcpu, val, SPSR_EL1); else vcpu_write_sys_reg(vcpu, val, SPSR_EL2); } else if (has_vhe()) { write_sysreg_el1(val, SYS_SPSR); } else { __vcpu_sys_reg(vcpu, SPSR_EL1) = val; } } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) { if (has_vhe()) write_sysreg(val, spsr_abt); else vcpu->arch.ctxt.spsr_abt = val; } static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) { if (has_vhe()) write_sysreg(val, spsr_und); else vcpu->arch.ctxt.spsr_und = val; } /* * This performs the exception entry at a given EL (@target_mode), stashing PC * and PSTATE into ELR and SPSR respectively, and computes the new PC/PSTATE. * The EL passed to this function *must* be a non-secure, privileged mode with * bit 0 being set (PSTATE.SP == 1).
* * When an exception is taken, most PSTATE fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. * * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. * * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from * MSB to LSB. */ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, enum exception_type type) { unsigned long sctlr, vbar, old, new, mode; u64 exc_offset; mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); if (mode == target_mode) exc_offset = CURRENT_EL_SP_ELx_VECTOR; else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) exc_offset = CURRENT_EL_SP_EL0_VECTOR; else if (!(mode & PSR_MODE32_BIT)) exc_offset = LOWER_EL_AArch64_VECTOR; else exc_offset = LOWER_EL_AArch32_VECTOR; switch (target_mode) { case PSR_MODE_EL1h: vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); break; case PSR_MODE_EL2h: vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2); sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2); __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2); break; default: /* Don't do that */ BUG(); } *vcpu_pc(vcpu) = vbar + exc_offset + type; old = *vcpu_cpsr(vcpu); new = 0; new |= (old & PSR_N_BIT); new |= (old & PSR_Z_BIT); new |= (old & PSR_C_BIT); new |= (old & PSR_V_BIT); if (kvm_has_mte(kern_hyp_va(vcpu->kvm))) new |= PSR_TCO_BIT; new |= (old & PSR_DIT_BIT); // PSTATE.UAO is set to zero upon any exception to AArch64 // See ARM DDI 0487E.a, page D5-2579. // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented // See ARM DDI 0487E.a, page D5-2578. new |= (old & PSR_PAN_BIT); if (!(sctlr & SCTLR_EL1_SPAN)) new |= PSR_PAN_BIT; // PSTATE.SS is set to zero upon any exception to AArch64 // See ARM DDI 0487E.a, page D2-2452. // PSTATE.IL is set to zero upon any exception to AArch64 // See ARM DDI 0487E.a, page D1-2306. // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 // See ARM DDI 0487E.a, page D13-3258 if (sctlr & SCTLR_ELx_DSSBS) new |= PSR_SSBS_BIT; // PSTATE.BTYPE is set to zero upon any exception to AArch64 // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. new |= PSR_D_BIT; new |= PSR_A_BIT; new |= PSR_I_BIT; new |= PSR_F_BIT; new |= target_mode; *vcpu_cpsr(vcpu) = new; __vcpu_write_spsr(vcpu, target_mode, old); } /* * When an exception is taken, most CPSR fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). * * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was * obsoleted by the ARMv7 virtualization extensions and is RES0. * * For the SPSR layout seen from AArch32, see: * - ARM DDI 0406C.d, page B1-1148 * - ARM DDI 0487E.a, page G8-6264 * * For the SPSR_ELx layout for AArch32 seen from AArch64, see: * - ARM DDI 0487E.a, page C5-426 * * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from * MSB to LSB. 
*/ static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) { u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); unsigned long old, new; old = *vcpu_cpsr(vcpu); new = 0; new |= (old & PSR_AA32_N_BIT); new |= (old & PSR_AA32_Z_BIT); new |= (old & PSR_AA32_C_BIT); new |= (old & PSR_AA32_V_BIT); new |= (old & PSR_AA32_Q_BIT); // CPSR.IT[7:0] are set to zero upon any exception // See ARM DDI 0487E.a, section G1.12.3 // See ARM DDI 0406C.d, section B1.8.3 new |= (old & PSR_AA32_DIT_BIT); // CPSR.SSBS is set to SCTLR.DSSBS upon any exception // See ARM DDI 0487E.a, page G8-6244 if (sctlr & BIT(31)) new |= PSR_AA32_SSBS_BIT; // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented // See ARM DDI 0487E.a, page G8-6246 new |= (old & PSR_AA32_PAN_BIT); if (!(sctlr & BIT(23))) new |= PSR_AA32_PAN_BIT; // SS does not exist in AArch32, so ignore // CPSR.IL is set to zero upon any exception // See ARM DDI 0487E.a, page G1-5527 new |= (old & PSR_AA32_GE_MASK); // CPSR.IT[7:0] are set to zero upon any exception // See prior comment above // CPSR.E is set to SCTLR.EE upon any exception // See ARM DDI 0487E.a, page G8-6245 // See ARM DDI 0406C.d, page B4-1701 if (sctlr & BIT(25)) new |= PSR_AA32_E_BIT; // CPSR.A is unchanged upon an exception to Undefined, Supervisor // CPSR.A is set upon an exception to other modes // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 // See ARM DDI 0406C.d, page B1-1182 new |= (old & PSR_AA32_A_BIT); if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) new |= PSR_AA32_A_BIT; // CPSR.I is set upon any exception // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 // See ARM DDI 0406C.d, page B1-1182 new |= PSR_AA32_I_BIT; // CPSR.F is set upon an exception to FIQ // CPSR.F is unchanged upon an exception to other modes // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 // See ARM DDI 0406C.d, page B1-1182 new |= (old & PSR_AA32_F_BIT); if (mode == PSR_AA32_MODE_FIQ) new |= PSR_AA32_F_BIT; // CPSR.T is set to SCTLR.TE upon any exception // See ARM DDI 0487E.a, page G8-5514 // See ARM DDI 0406C.d, page B1-1181 if (sctlr & BIT(30)) new |= PSR_AA32_T_BIT; new |= mode; return new; } /* * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. 
*/ static const u8 return_offsets[8][2] = { [0] = { 0, 0 }, /* Reset, unused */ [1] = { 4, 2 }, /* Undefined */ [2] = { 0, 0 }, /* SVC, unused */ [3] = { 4, 4 }, /* Prefetch abort */ [4] = { 8, 8 }, /* Data abort */ [5] = { 0, 0 }, /* HVC, unused */ [6] = { 4, 4 }, /* IRQ, unused */ [7] = { 4, 4 }, /* FIQ, unused */ }; static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { unsigned long spsr = *vcpu_cpsr(vcpu); bool is_thumb = (spsr & PSR_AA32_T_BIT); u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); u32 return_address; *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); return_address = *vcpu_pc(vcpu); return_address += return_offsets[vect_offset >> 2][is_thumb]; /* KVM only enters the ABT and UND modes, so only deal with those */ switch(mode) { case PSR_AA32_MODE_ABT: __vcpu_write_spsr_abt(vcpu, host_spsr_to_spsr32(spsr)); vcpu_gp_regs(vcpu)->compat_lr_abt = return_address; break; case PSR_AA32_MODE_UND: __vcpu_write_spsr_und(vcpu, host_spsr_to_spsr32(spsr)); vcpu_gp_regs(vcpu)->compat_lr_und = return_address; break; } /* Branch to exception vector */ if (sctlr & (1 << 13)) vect_offset += 0xffff0000; else /* always have security exceptions */ vect_offset += __vcpu_read_sys_reg(vcpu, VBAR_EL1); *vcpu_pc(vcpu) = vect_offset; } static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA32_UND): enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); break; case unpack_vcpu_flag(EXCEPT_AA32_IABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); break; case unpack_vcpu_flag(EXCEPT_AA32_DABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); break; default: /* Err... */ break; } } else { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync); break; case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ): enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq); break; default: /* * Only EL1_SYNC and EL2_{SYNC,IRQ} make * sense so far. Everything else gets silently * ignored. */ break; } } } /* * Adjust the guest PC (and potentially exception state) depending on * flags provided by the emulation code. */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) { kvm_inject_exception(vcpu); vcpu_clear_flag(vcpu, PENDING_EXCEPTION); vcpu_clear_flag(vcpu, EXCEPT_MASK); } else if (vcpu_get_flag(vcpu, INCREMENT_PC)) { kvm_skip_instr(vcpu); vcpu_clear_flag(vcpu, INCREMENT_PC); } }
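In the AArch32 injection path above, enter_exception32() computes the banked LR as PC plus return_offsets[vect_offset >> 2][is_thumb] and branches to either the high-vectors region (when SCTLR bit 13 is set) or to VBAR_EL1 plus the vector offset. Below is a hedged userspace sketch of just those two computations; SCTLR_V_BIT and the helper names are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCTLR_V_BIT (1u << 13)  /* high-vectors bit, as tested in enter_exception32() */

static const uint8_t return_offsets[8][2] = {
        [1] = { 4, 2 },   /* Undefined: ARM +4, Thumb +2 */
        [3] = { 4, 4 },   /* Prefetch abort */
        [4] = { 8, 8 },   /* Data abort */
};

static uint32_t banked_lr(uint32_t pc, uint32_t vect_offset, bool is_thumb)
{
        /* Same indexing as the code above: vector offset / 4 picks the row. */
        return pc + return_offsets[vect_offset >> 2][is_thumb];
}

static uint32_t vector_address(uint32_t sctlr, uint32_t vbar, uint32_t vect_offset)
{
        /* SCTLR.V set -> hivecs at 0xffff0000, otherwise VBAR-relative. */
        return (sctlr & SCTLR_V_BIT) ? 0xffff0000u + vect_offset : vbar + vect_offset;
}

int main(void)
{
        /* Thumb data abort (vector offset 16): LR = PC + 8, low vectors at VBAR. */
        printf("lr_abt = 0x%x\n", (unsigned)banked_lr(0x1000, 16, true));
        printf("vector = 0x%x\n", (unsigned)vector_address(0, 0x8000, 16));
        return 0;
}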
// SPDX-License-Identifier: GPL-2.0-only /* * IRQ offload/bypass manager * * Copyright (C) 2015 Red Hat, Inc. * Copyright (c) 2015 Linaro Ltd. * * Various virtualization hardware acceleration techniques allow bypassing or * offloading interrupts received from devices around the host kernel. Posted * Interrupts on Intel VT-d systems can allow interrupts to be received * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical * interrupts to be directly deactivated by the guest. This manager allows * interrupt producers and consumers to find each other to enable this sort of * bypass. */ #include <linux/irqbypass.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IRQ bypass manager utility module"); static LIST_HEAD(producers); static LIST_HEAD(consumers); static DEFINE_MUTEX(lock); /* @lock must be held when calling connect */ static int __connect(struct irq_bypass_producer *prod, struct irq_bypass_consumer *cons) { int ret = 0; if (prod->stop) prod->stop(prod); if (cons->stop) cons->stop(cons); if (prod->add_consumer) ret = prod->add_consumer(prod, cons); if (!ret) { ret = cons->add_producer(cons, prod); if (ret && prod->del_consumer) prod->del_consumer(prod, cons); } if (cons->start) cons->start(cons); if (prod->start) prod->start(prod); return ret; } /* @lock must be held when calling disconnect */ static void __disconnect(struct irq_bypass_producer *prod, struct irq_bypass_consumer *cons) { if (prod->stop) prod->stop(prod); if (cons->stop) cons->stop(cons); cons->del_producer(cons, prod); if (prod->del_consumer) prod->del_consumer(prod, cons); if (cons->start) cons->start(cons); if (prod->start) prod->start(prod); } /** * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure * * Add the provided IRQ producer to the list of producers and connect * with any matching token found on the IRQ consumers list.
*/ int irq_bypass_register_producer(struct irq_bypass_producer *producer) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; int ret; if (!producer->token) return -EINVAL; might_sleep(); if (!try_module_get(THIS_MODULE)) return -ENODEV; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { if (tmp->token == producer->token) { ret = -EBUSY; goto out_err; } } list_for_each_entry(consumer, &consumers, node) { if (consumer->token == producer->token) { ret = __connect(producer, consumer); if (ret) goto out_err; break; } } list_add(&producer->node, &producers); mutex_unlock(&lock); return 0; out_err: mutex_unlock(&lock); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); /** * irq_bypass_unregister_producer - unregister IRQ bypass producer * @producer: pointer to producer structure * * Remove a previously registered IRQ producer from the list of producers * and disconnect it from any connected IRQ consumer. */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; if (!producer->token) return; might_sleep(); if (!try_module_get(THIS_MODULE)) return; /* nothing in the list anyway */ mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { if (tmp->token != producer->token) continue; list_for_each_entry(consumer, &consumers, node) { if (consumer->token == producer->token) { __disconnect(producer, consumer); break; } } list_del(&producer->node); module_put(THIS_MODULE); break; } mutex_unlock(&lock); module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); /** * irq_bypass_register_consumer - register IRQ bypass consumer * @consumer: pointer to consumer structure * * Add the provided IRQ consumer to the list of consumers and connect * with any matching token found on the IRQ producer list. */ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; int ret; if (!consumer->token || !consumer->add_producer || !consumer->del_producer) return -EINVAL; might_sleep(); if (!try_module_get(THIS_MODULE)) return -ENODEV; mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { if (tmp->token == consumer->token || tmp == consumer) { ret = -EBUSY; goto out_err; } } list_for_each_entry(producer, &producers, node) { if (producer->token == consumer->token) { ret = __connect(producer, consumer); if (ret) goto out_err; break; } } list_add(&consumer->node, &consumers); mutex_unlock(&lock); return 0; out_err: mutex_unlock(&lock); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); /** * irq_bypass_unregister_consumer - unregister IRQ bypass consumer * @consumer: pointer to consumer structure * * Remove a previously registered IRQ consumer from the list of consumers * and disconnect it from any connected IRQ producer. 
*/ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; if (!consumer->token) return; might_sleep(); if (!try_module_get(THIS_MODULE)) return; /* nothing in the list anyway */ mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { if (tmp != consumer) continue; list_for_each_entry(producer, &producers, node) { if (producer->token == consumer->token) { __disconnect(producer, consumer); break; } } list_del(&consumer->node); module_put(THIS_MODULE); break; } mutex_unlock(&lock); module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
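Producer/consumer pairing in this manager is keyed purely on the opaque token: when a registration finds an entry on the other list with the same token, __connect() wires the two together via the consumer's add_producer callback (and the producer's optional add_consumer). The toy userspace model below sketches only that token-matching step; struct toy_producer/toy_consumer and the helpers are invented names, not the in-kernel irqbypass API.

#include <stdio.h>

struct toy_producer {
        void *token;
};

struct toy_consumer {
        void *token;
        int (*add_producer)(struct toy_consumer *, struct toy_producer *);
};

static int note_connection(struct toy_consumer *c, struct toy_producer *p)
{
        printf("connected consumer %p to producer %p\n", (void *)c, (void *)p);
        return 0;
}

/* Mirrors the pairing step: connect only when the opaque tokens match. */
static int try_connect(struct toy_consumer *c, struct toy_producer *p)
{
        if (c->token != p->token)
                return 0;                /* no match, nothing to do */
        return c->add_producer(c, p);    /* __connect() does this in the real code */
}

int main(void)
{
        int eventfd_cookie;              /* stands in for the shared token */
        struct toy_producer prod = { .token = &eventfd_cookie };
        struct toy_consumer cons = { .token = &eventfd_cookie, .add_producer = note_connection };

        try_connect(&cons, &prod);
        return 0;
}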
3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 
4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 
5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 
5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 
6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 
7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 
8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 
8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 
9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 
10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 
10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 
11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 
11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	NET3	Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *	Pekka Riikonen	:	Netdev boot-time settings code
 *	Andrew Morton	:	Make unregister_netdevice wait
 *				indefinitely on dev->refcnt
 *	J Hadi Salim	:	- Backlog queue sampling
 *				- netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/rps.h>
#include <linux/phy_link_topology.h>

#include "dev.h"
#include "devmem.h"
#include "net-sysfs.h"

static DEFINE_SPINLOCK(ptype_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);

static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static inline void dev_base_seq_inc(struct net *net)
{
	unsigned int val = net->dev_base_seq + 1;

	WRITE_ONCE(net->dev_base_seq, val ?: 1);
}
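/*
 * Illustrative sketch (not part of dev.c): dev_base_seq_inc() above bumps a
 * per-netns change counter whenever the device list is modified, skipping
 * the value 0. A dump-style reader could snapshot the counter before walking
 * the list and compare afterwards to detect a concurrent change. The helper
 * below is a hypothetical example of that pattern, not a kernel API.
 */
#if 0	/* example only, never compiled */
static bool example_dev_list_changed(struct net *net, unsigned int snapshot)
{
	/* dev_base_seq never becomes 0, so 0 can safely mean "no snapshot" */
	return READ_ONCE(net->dev_base_seq) != snapshot;
}
#endif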
static inline struct hlist_head *dev_name_hash(struct net *net,
					       const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

#ifndef CONFIG_PREEMPT_RT

static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

static int __init setup_backlog_napi_threads(char *arg)
{
	static_branch_enable(&use_backlog_threads_key);
	return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);

static bool use_backlog_threads(void)
{
	return static_branch_unlikely(&use_backlog_threads_key);
}

#else

static bool use_backlog_threads(void)
{
	return true;
}

#endif

static inline void backlog_lock_irq_save(struct softnet_data *sd,
					 unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
	else
		local_irq_save(*flags);
}

static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
		spin_lock_irq(&sd->input_pkt_queue.lock);
	else
		local_irq_disable();
}

static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
					      unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
	else
		local_irq_restore(*flags);
}

static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
		spin_unlock_irq(&sd->input_pkt_queue.lock);
	else
		local_irq_enable();
}
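/*
 * Illustrative sketch (not part of dev.c): the backlog_lock/unlock helpers
 * above are meant to be used in matched pairs around accesses to a
 * softnet_data backlog queue. With RPS or threaded backlog NAPI the queue
 * spinlock is taken; otherwise disabling local interrupts is sufficient.
 * The hypothetical caller below only shows the pairing.
 */
#if 0	/* example only, never compiled */
static unsigned int example_backlog_qlen(struct softnet_data *sd)
{
	unsigned long flags;
	unsigned int qlen;

	backlog_lock_irq_save(sd, &flags);
	qlen = skb_queue_len(&sd->input_pkt_queue);
	backlog_unlock_irq_restore(sd, &flags);

	return qlen;
}
#endif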
netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } static void netdev_name_node_alt_free(struct rcu_head *head) { struct netdev_name_node *name_node = container_of(head, struct netdev_name_node, rcu); kfree(name_node->name); netdev_name_node_free(name_node); } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); list_del(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { list_del(&name_node->list); netdev_name_node_alt_free(&name_node->rcu); } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), }; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). 
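/*
 * A minimal illustrative sketch (not taken from this file) of the
 * alternative-name helpers above.  It follows the ownership rule visible in
 * the code: on success netdev_name_node_alt_create() keeps the string and
 * frees it itself later, so the caller must hand over a heap copy.
 * altname_example() and "uplink0" are made-up names; RTNL is assumed held.
 */
static int altname_example(struct net_device *dev)
{
        char *name = kstrdup("uplink0", GFP_KERNEL);
        int err;

        if (!name)
                return -ENOMEM;
        err = netdev_name_node_alt_create(dev, name);
        if (err) {                      /* e.g. -EEXIST: name already in use */
                kfree(name);
                return err;
        }
        /* later: remove it again; the stored copy is freed after an RCU grace period */
        return netdev_name_node_alt_destroy(dev, "uplink0");
}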
*/ DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! 
Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. 
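/*
 * A minimal illustrative sketch (not taken from this file) of the
 * dev_add_pack() registration pattern documented above.  The handler
 * my_proto_rcv and the use of the local-experimental ethertype
 * ETH_P_802_EX1 are assumptions made only for the example.
 */
static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
                        struct packet_type *pt, struct net_device *orig_dev)
{
        /* a real handler would parse the frame; here we simply consume it */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type my_proto_pt __read_mostly = {
        .type = htons(ETH_P_802_EX1),
        .func = my_proto_rcv,
};

/* module init: dev_add_pack(&my_proto_pt);  module exit: dev_remove_pack(&my_proto_pt); */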
Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct * netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; napi = napi_by_id(napi_id); if (!napi) return NULL; if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL; return napi; } /** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found. */ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { struct napi_struct *napi; struct net_device *dev; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { rcu_read_unlock(); return NULL; } dev = napi->dev; dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (!dev) return NULL; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev) napi = NULL; rcu_read_unlock(); if (!napi) netdev_unlock(dev); return napi; } /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. 
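/*
 * Sketch (assumed helper, not from this file) of the lookup/unlock pairing
 * that netdev_napi_by_id_lock() above requires: the instance lock taken by
 * the lookup must be dropped with netdev_unlock() once the caller is done.
 */
static void napi_lookup_example(struct net *net, unsigned int napi_id)
{
        struct napi_struct *napi;

        napi = netdev_napi_by_id_lock(net, napi_id);
        if (!napi)
                return;
        netdev_info(napi->dev, "found NAPI id %u\n", napi->napi_id);
        netdev_unlock(napi->dev);       /* release the lock taken by the lookup */
}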
*/ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. 
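/*
 * Sketch (assumed helper, not from this file) of the tracked-reference
 * pattern netdev_get_by_name() expects; the "eth0" name is illustrative.
 */
static void tracked_lookup_example(struct net *net)
{
        netdevice_tracker tracker;
        struct net_device *dev;

        dev = netdev_get_by_name(net, "eth0", &tracker, GFP_KERNEL);
        if (!dev)
                return;
        netdev_info(dev, "holding a tracked reference\n");
        netdev_put(dev, &tracker);      /* drop the reference taken by the lookup */
}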
*/ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (napi_id < MIN_NAPI_ID) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } /* Release the held reference on the net_device, and if the net_device * is still registered try to lock the instance lock. If device is being * unregistered NULL will be returned (but the reference has been released, * either way!) * * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ struct net_device *__netdev_put_lock(struct net_device *dev) { netdev_lock(dev); if (dev->reg_state > NETREG_REGISTERED) { netdev_unlock(dev); dev_put(dev); return NULL; } dev_put(dev); return dev; } /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. If a valid device * with @ifindex is found it will be returned with netdev->lock held. * netdev_unlock() must be called to release it. * * Return: pointer to a device with lock held, NULL if not found. 
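/*
 * Sketch (assumed helper, not from this file) of the RCU-protected lookup
 * pattern dev_get_by_index_rcu() is written for.
 */
static void rcu_lookup_example(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                netdev_info(dev, "ifindex %d is present\n", ifindex);
        rcu_read_unlock();              /* dev must not be used past this point */
}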
*/ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock(dev); } struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (dev) return dev; (*index)++; } while (true); } static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) { unsigned int seq; do { seq = read_seqbegin(&netdev_rename_lock); strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); return ret; } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. 
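/*
 * Sketch (assumed helper, not from this file) of a MAC-address lookup with
 * dev_getbyhwaddr_rcu(); the sample address is arbitrary.
 */
static void hwaddr_lookup_example(struct net *net)
{
        static const u8 mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, (const char *)mac);
        if (dev)
                netdev_info(dev, "owns the queried MAC address\n");
        rcu_read_unlock();
}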
*/ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. */ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. 
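/*
 * Sketch (assumed helper, not from this file) of handing a "%d" template to
 * dev_alloc_name() as described above; "dummy%d" is only an example template
 * and RTNL is assumed to be held by the caller.
 */
static int name_alloc_example(struct net_device *dev)
{
        int unit = dev_alloc_name(dev, "dummy%d");      /* picks dummy0, dummy1, ... */

        if (unit < 0)
                return unit;            /* -EINVAL, -ENFILE or -ENOMEM */
        netdev_info(dev, "assigned unit %d\n", unit);
        return 0;
}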
*/ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; ASSERT_RTNL_NET(net); if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; memcpy(oldname, dev->name, IFNAMSIZ); write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); if (err < 0) return err; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); synchronize_net(); netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. 
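/*
 * Sketch (assumed helper, not from this file) pairing dev_set_alias() with
 * dev_get_alias(); the alias text is illustrative.
 */
static void alias_example(struct net_device *dev)
{
        static const char desc[] = "uplink to core switch";
        char buf[IFALIASZ];

        if (dev_set_alias(dev, desc, strlen(desc)) < 0)
                return;
        if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
                netdev_info(dev, "ifalias is now \"%s\"\n", buf);
}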
*/ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. 
* If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); netif_set_up(dev, false); netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. 
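/*
 * Sketch (assumed helper, not from this file) of driving an interface up and
 * back down from kernel code with the helpers above; RTNL is taken here only
 * for the example's sake.
 */
static int updown_example(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev, NULL);      /* NULL: no extack for error reporting */
        if (!err)
                dev_close(dev);
        rtnl_unlock();
        return err;
}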
*/ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { err = call_netdevice_register_notifiers(nb, dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) 
call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); __rtnl_net_unlock(net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. 
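/*
 * Sketch (assumed callback and notifier_block, not from this file) of the
 * classic pattern served by register_netdevice_notifier().
 */
static int my_netdev_event(struct notifier_block *nb, unsigned long event,
                           void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                netdev_info(dev, "interface is up\n");
                break;
        case NETDEV_GOING_DOWN:
                netdev_info(dev, "interface is about to go down\n");
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
        .notifier_call = my_netdev_event,
};

/* init: register_netdevice_notifier(&my_netdev_nb);  exit: unregister_netdevice_notifier(&my_netdev_nb); */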
* * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); list_del(&nn->list); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. 
*/ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). */ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). 
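/*
 * Sketch (assumed helper, not from this file) of how in-kernel callers
 * typically consume the chain result of call_netdevice_notifiers();
 * RTNL is assumed to be held.
 */
static int notify_example(struct net_device *dev)
{
        int ret = call_netdevice_notifiers(NETDEV_CHANGE, dev);

        return notifier_to_errno(ret);  /* 0, or the -errno a notifier vetoed with */
}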
*/ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * 
start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active - return true if any network interface taps are in use * * @dev: network device to check for the presence of taps */ bool dev_nit_active(struct net_device *dev) { return !list_empty(&net_hotdata.ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. 
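/*
 * Sketch (assumed helper, not from this file) of the virtual-device
 * forwarding pattern dev_forward_skb() is meant for, e.g. handing a frame
 * from one end of a device pair to its peer's receive path.
 */
static int forward_to_peer_example(struct sk_buff *skb, struct net_device *peer)
{
        /* scrubs namespace-sensitive state, then feeds skb into peer's rx path */
        return dev_forward_skb(peer, skb);      /* NET_RX_SUCCESS or NET_RX_DROP */
}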
To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 
offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. 
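 *
 * (Aside, a hypothetical configuration example rather than code from this
 * file: this function is normally reached when userspace writes the XPS
 * masks, e.g.
 *
 *	echo 0f > /sys/class/net/eth0/queues/tx-2/xps_cpus
 *
 * which lands in netif_set_xps_queue() with type == XPS_CPUS; the xps_rxqs
 * attribute takes the same path with type == XPS_RXQS.)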
*/ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
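 *
 * A minimal usage sketch (hypothetical driver code, not part of this file):
 * a driver resizing its channel count while holding the rtnl lock would do
 * something like
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(netdev, new_txq);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(netdev, new_rxq);
 *	rtnl_unlock();
 *
 * so that qdisc state, XPS maps and sysfs kobjects are updated before
 * traffic can hit the new queue range.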
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); net_shaper_set_real_num_tx_queues(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. 
* Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. */ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. 
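 *
 * A minimal usage sketch (hypothetical driver code): detach is normally
 * paired with netif_device_attach() around suspend/resume or a reset, e.g.
 *
 *	netif_device_detach(netdev);
 *	...quiesce the hardware, then bring it back up...
 *	netif_device_attach(netdev);
 *
 * so the stack stops, and later restarts, the TX queues on our behalf.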
*/ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } if (!skb_frags_readable(skb)) { return -EFAULT; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
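 *
 * (Worked example of the offsets used below, as an aside: for a TCP/IPv4
 * packet set up with CHECKSUM_PARTIAL, skb_checksum_start_offset() points
 * at the TCP header and skb->csum_offset is offsetof(struct tcphdr, check),
 * i.e. 16, so the folded checksum is written 16 bytes into that header.)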
*/ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (page && PageHighMem(page)) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
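 *
 * (Aside: for an MPLS-encapsulated skb only the offload bits a driver
 * opted into via dev->mpls_features survive this check, so checksum or
 * GSO features missing there make the packet fall back to the software
 * paths later in the xmit sequence.)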
*/ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? 
inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && skb_network_header_len(skb) != sizeof(struct ipv6hdr) && !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. 
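 *
 * (Example, as an aside: a CHECKSUM_PARTIAL UDP packet leaving a device
 * that offers neither NETIF_F_HW_CSUM nor the matching NETIF_F_IP_CSUM or
 * NETIF_F_IPV6_CSUM bit ends up in skb_checksum_help() by way of
 * skb_csum_hwoffload_help() just below.)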
*/ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; /* mac layer + network layer */ hdr_len = skb_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { int payload = skb->len - hdr_len; /* Malicious packet. */ if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. 
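 *
 * (Why the retest matters, as an aside: between the lockless
 * nolock_qdisc_is_empty() check above and winning q->seqlock, another CPU
 * may have requeued packets; transmitting ours directly now would let it
 * jump ahead of them, so the non-empty case falls back to a plain enqueue.)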
*/ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); goto no_lock_out; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); return rc; } if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return NET_XMIT_DROP; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock. */ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } #ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return 
__this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #else static bool netdev_xmit_txqueue_skipped(void) { return current->net_xmit.skip_txqueue; } void netdev_xmit_skip_txqueue(bool skip) { current->net_xmit.skip_txqueue = skip; } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; /* Global bypass */ if (!static_branch_likely(&tcf_sw_enabled_key)) return ret; /* Block-wise bypass */ if (tcf_block_bypass_sw(miniq->block)) return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. 
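 *
 * (Example, as an aside: this is the verdict produced when a BPF program
 * attached here returns the result of bpf_redirect() or
 * bpf_redirect_peer(); skb_do_redirect() then acts on the per-CPU redirect
 * state those helpers recorded.)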
*/ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else 
return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. 
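 *
 * (Aside: IFF_XMIT_DST_RELEASE is part of the default priv_flags set in
 * alloc_netdev_mqs(), so most devices take the skb_dst_drop() branch;
 * devices that still need the route on their xmit path, e.g. some tunnel
 * drivers, clear it with netif_keep_dst() and get skb_dst_force() instead.)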
*/ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. 
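 *
 * (Aside, a hypothetical configuration example: the threaded branch is
 * only taken once NAPI threads have been enabled, either by a driver
 * calling dev_set_threaded() or from userspace with
 *
 *	echo 1 > /sys/class/net/eth0/threaded
 *
 * after which each NAPI instance is polled by its own kthread.)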
*/ thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi; set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u16 rxq_index; u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; WRITE_ONCE(rflow->filter, rc); if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); rps_input_queue_tail_save(&rflow->last_qtail, head); } WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
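 *
 * (Worked example, as an aside: rps_record_sock_flow() stores
 * (hash & ~rps_cpu_mask) | cpu, so with rps_cpu_mask == 0xff an ident of
 * 0xabcd1203 means "a flow hashing to 0xabcd12xx last ran recvmsg on
 * CPU 3"; the xor below discards the entry when its upper hash bits do not
 * match the packet currently being steered.)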
*/ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. 
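 *
 * (Aside, a hypothetical configuration example: the cross-CPU case only
 * arises once RPS is enabled from userspace, e.g.
 *
 *	echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
 *
 * which lets get_rps_cpu() pick one of CPUs 0-3 for packets from rx-0.)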
*/ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) { __napi_schedule_irqoff(&sd->backlog); return; } sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) { unsigned long flags; if (use_backlog_threads()) { backlog_lock_irq_save(sd, &flags); if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); backlog_unlock_irq_restore(sd, &flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; int max_backlog; u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. 
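 *
 * (Aside: the schedule is only needed on the empty to non-empty
 * transition; if the queue already holds packets, the backlog NAPI was
 * scheduled when the first one was queued and has not drained yet, so
 * setting NAPI_STATE_SCHED again would be redundant.)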
*/ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. 
* * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static int netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) return 0; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); troom = skb->tail + skb->data_len - skb->end; err = pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err; return skb_linearize(skb); } static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well. */ mac_len = skb->data - skb_mac_header(skb); __skb_push(skb, mac_len); if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } __skb_pull(*pskb, mac_len); act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(*pskb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
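 *
 * Generic XDP is what runs when a program is attached in "generic"/skb
 * mode; a rough illustration (assuming iproute2 and a libbpf-built
 * object, names hypothetical) would be:
 *
 *	ip link set dev eth0 xdpgeneric obj xdp_prog.o sec xdp
 *
 * with a minimal program such as:
 *
 *	SEC("xdp")
 *	int xdp_pass(struct xdp_md *ctx)
 *	{
 *		return XDP_PASS;	// let every frame continue up the stack
 *	}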
*/ void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(*pskb, xdp_prog); break; } bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * driver should use NAPI and GRO. * This function can used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function. 
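 *
 * Minimal usage sketch (hypothetical driver, for illustration only): a
 * receive path that has built a complete skb hands it to the stack with
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	ret = netif_rx(skb);
 *
 * netif_rx() disables bottom halves itself when called from process
 * context, so the caller needs no extra locking around the call.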
* * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. 
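 *
 * Sketch of a handler (hypothetical names, for illustration only), in
 * the style used by bridge/bonding/macvlan:
 *
 *	static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!my_port_wants(port, skb))
 *			return RX_HANDLER_PASS;		// normal delivery continues
 *
 *		my_port_receive(port, skb);
 *		return RX_HANDLER_CONSUMED;		// the skb now belongs to us
 *	}
 *
 * registered under RTNL with
 *
 *	err = netdev_rx_handler_register(dev, my_rx_handler, port);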
*/ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks. 
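 * With CONFIG_DEBUG_NET the fallback below is compiled out entirely, so
 * a path that forgot to set the transport header surfaces as a bug
 * instead of being silently papered over here.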
*/ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); #endif skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. 
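 * (A VLAN ID of 0 means the frame was merely priority-tagged: the
 * 802.1Q tag carries a PCP value but no real VLAN, which is why no
 * VLAN device may exist to receive it.)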
*/ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. 
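 *
 * For example, a burst of plain IPv4 skbs from a single device is
 * handed to ip_list_rcv() in one call, while a packet that resolves to
 * a different ptype or orig_dev simply flushes the sublist built so far
 * and starts a new one.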
*/ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { 
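		/* skb_defer_rx_timestamp() may claim the skb for deferred RX
		 * timestamping; only unclaimed skbs are re-added to the local
		 * sublist for normal processing below.
		 */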
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct softnet_data *sd; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); backlog_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. 
*/ return true; } struct flush_backlogs { cpumask_t flush_cpus; struct work_struct w[]; }; static struct flush_backlogs *flush_backlogs_alloc(void) { return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), GFP_KERNEL); } static struct flush_backlogs *flush_backlogs_fallback; static DEFINE_MUTEX(flush_backlogs_mutex); static void flush_all_backlogs(void) { struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; if (!ptr) { mutex_lock(&flush_backlogs_mutex); ptr = flush_backlogs_fallback; } cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); for_each_online_cpu(cpu) { if (flush_required(cpu)) { INIT_WORK(&ptr->w[cpu], flush_backlog); queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them. */ for_each_cpu(cpu, &ptr->flush_cpus) flush_work(&ptr->w[cpu]); cpus_read_unlock(); if (ptr != flush_backlogs_fallback) kfree(ptr); else mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); if (++work >= quota) { rps_input_queue_head_add(sd, work); return work; } local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
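 * The plain AND below keeps only NAPIF_STATE_THREADED, clearing
 * NAPI_STATE_SCHED without an atomic read-modify-write.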
*/ napi->state &= NAPIF_STATE_THREADED; again = false; } else { local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } if (work) rps_input_queue_head_add(sd, work); return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. 
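 *
 * Typical driver usage of napi_complete_done() (sketch, hypothetical
 * names): the poll callback only re-arms its interrupt when the NAPI
 * instance really completed:
 *
 *	static int my_napi_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = my_clean_rx_ring(napi, budget);	// work <= budget
 *
 *		if (work < budget && napi_complete_done(napi, work))
 *			my_enable_rx_irq(napi);
 *		return work;
 *	}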
*/ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro_bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); } gro_normal_list(n); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; napi_consume_skb(skb, 1); skb = next; } } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(napi); __napi_schedule(napi); return; } if (napi->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(napi, HZ >= 1000); } gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } enum { NAPI_F_PREFER_BUSY_POLL = 1, NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. 
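 *
 * Busy polling is normally driven from the socket layer: the
 * SO_BUSY_POLL socket option (or the net.core.busy_read and
 * net.core.busy_poll sysctls) makes blocking receives and poll()/epoll
 * spin in napi_busy_loop() instead of sleeping, and SO_PREFER_BUSY_POLL
 * selects the NAPI_F_PREFER_BUSY_POLL behaviour handled here.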
*/ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } static void __napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); restart: napi_poll = NULL; napi = napi_by_id(napi_id); if (!napi) return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (flags & NAPI_F_END_ON_RESCHED) break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = NAPI_F_END_ON_RESCHED; if (prefer_busy_poll) flags |= NAPI_F_PREFER_BUSY_POLL; __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = prefer_busy_poll ? 
NAPI_F_PREFER_BUSY_POLL : 0; rcu_read_lock(); __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { unsigned long timeout = napi_get_irq_suspend_timeout(napi); if (timeout) hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } rcu_read_unlock(); } void napi_resume_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing. */ if (napi_get_irq_suspend_timeout(napi)) { local_bh_disable(); napi_schedule(napi); local_bh_enable(); } } rcu_read_unlock(); } #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { unsigned long flags; if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } static void init_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } napi->gro_bitmask = 0; } int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); if (dev->threaded == threaded) return 0; if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = false; break; } } } } WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. */ smp_mb__before_atomic(); /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. 
In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ list_for_each_entry(napi, &dev->napi_list, dev_list) assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } EXPORT_SYMBOL(dev_set_threaded); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. */ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; if (dev->reg_state >= NETREG_REGISTERED) ASSERT_RTNL(); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); } else { napi_hash_add(n); n->config->napi_id = n->napi_id; } } static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; napi_hash_del(n); } /* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position. 
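 * (An ID can be inherited because napi_restore_config() above reuses
 * config->napi_id, so a queue's NAPI keeps the same ID across ring
 * reconfiguration and the list stays sorted.)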
*/ static void netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) { unsigned int new_id, pos_id; struct list_head *higher; struct napi_struct *pos; new_id = UINT_MAX; if (napi->config && napi->config->napi_id) new_id = napi->config->napi_id; higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { if (pos->napi_id >= MIN_NAPI_ID) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; else pos_id = UINT_MAX; if (pos_id <= new_id) break; higher = &pos->dev_list; } list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable */ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = false; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); netdev_assert_locked(n->dev); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); if (n->config) napi_save_config(n); else napi_hash_del(n); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable_locked); /** * napi_disable() - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. * Takes netdev_lock() for associated net_device. 
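 *
 * Typical pattern in a driver's down path (sketch, hypothetical names):
 *
 *	napi_disable(&ring->napi);	// wait for any in-flight poll to finish
 *	my_stop_ring(ring);
 *	...
 *	napi_enable(&ring->napi);	// later, on the way back up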
*/ void napi_disable(struct napi_struct *n) { netdev_lock(n->dev); napi_disable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_disable); void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); if (n->config) napi_restore_config(n); else napi_hash_add(n); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); /** * napi_enable() - enable NAPI scheduling * @n: NAPI context * * Enable scheduling of a NAPI instance. * Must be paired with napi_disable(). * Takes netdev_lock() for associated net_device. */ void napi_enable(struct napi_struct *n) { netdev_lock(n->dev); napi_enable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_enable); static void flush_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); napi->gro_hash[i].count = 0; } } /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { netdev_assert_locked(napi->dev); if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; if (napi->config) { napi->index = -1; napi->config = NULL; } list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(n, HZ >= 1000); } gro_normal_list(n); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? 
n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) list_add_tail(&n->poll_list, repoll); netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void napi_threaded_poll_loop(struct napi_struct *napi) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; for (;;) { bool repoll = false; void *have; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) break; rcu_softirq_qs_periodic(last_qs); cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; while (!napi_thread_wait(napi)) napi_threaded_poll_loop(napi); return 0; } static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(sd); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. 
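 * Both limits are tunable: net.core.netdev_budget bounds the number of
 * packets and net.core.netdev_budget_usecs the time spent per softirq
 * invocation; hitting either one increments time_squeeze, visible in
 * /proc/net/softnet_stat.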
*/ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end: bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. 
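 *
 * For example, for a port enslaved to a bond or bridge this returns the
 * bond/bridge device (sketch, hypothetical caller):
 *
 *	rtnl_lock();
 *	master = netdev_master_upper_dev_get(port_dev);
 *	if (master)
 *		netdev_info(master, "master of %s\n", netdev_name(port_dev));
 *	rtnl_unlock();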
*/ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { 
	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;

	now = dev;
	iter = &dev->adj_list.upper;

	while (1) {
		if (now != dev) {
			ret = fn(now, priv);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			udev = netdev_next_upper_dev_rcu(now, &iter);
			if (!udev)
				break;

			next = udev;
			niter = &udev->adj_list.upper;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);

static bool __netdev_has_upper_dev(struct net_device *dev,
				   struct net_device *upper_dev)
{
	struct netdev_nested_priv priv = {
		.flags = 0,
		.data = (void *)upper_dev,
	};

	ASSERT_RTNL();

	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
					   &priv);
}

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *			   list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
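 *
 * Callers normally do not use this helper directly; the
 * netdev_for_each_lower_dev() macro from <linux/netdevice.h> wraps it,
 * roughly like this (illustrative sketch only, do_something() is a
 * placeholder):
 *
 *	struct net_device *ldev;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, ldev, iter)
 *		do_something(ldev);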
*/ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < 
ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? 
"upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device 
*upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. 
*/ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); 
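	/*
	 * netdev_adjacent_change_prepare() linked @new_dev under @dev and
	 * marked the @old_dev adjacency as ignored; aborting means
	 * re-enabling the old adjacency (just above) and tearing the
	 * speculative new link back down (below), restoring the topology
	 * that existed before the prepare step.
	 */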
__netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), GFP_KERNEL); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static 
int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = &report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. 
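 *
 * Hedged usage sketch (hypothetical caller; master_dev and use_slave()
 * are placeholders): under rcu_read_lock(), a caller that wants to know
 * which slave a LAG/bond master would transmit @skb on can do:
 *
 *	slave = netdev_get_xmit_slave(master_dev, skb, false);
 *	if (slave)
 *		use_slave(slave);
 *
 * This only returns a device for masters whose driver implements
 * ndo_get_xmit_slave() (the bonding driver, for example); otherwise
 * %NULL is returned.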
*/ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. */ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. 
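 *
 * Hedged sketch of a typical caller: a LAG master driver (bonding or
 * team), holding RTNL, reports the new state of one of its slaves via
 * the shared struct netdev_lag_lower_state_info:
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up    = true,
 *		.tx_enabled = true,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);
 *
 * The pointer is handed to NETDEV_CHANGELOWERSTATE listeners as
 * netdev_notifier_changelowerstate_info::lower_state_info (see the
 * function body below).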
*/ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_PROMISC; } else { flags = old_flags | IFF_PROMISC; } WRITE_ONCE(dev->promiscuity, promiscuity); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; ASSERT_RTNL(); allmulti = dev->allmulti + inc; if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_ALLMULTI; } else { flags = old_flags | IFF_ALLMULTI; } WRITE_ONCE(dev->allmulti, allmulti); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? 
"entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 
1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. 
*/ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack */ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL(dev_pre_changeaddr_notify); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, sa); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); ret = dev_set_mac_address(dev, sa, extack); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_get_mac_address); 
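/*
 * Hedged usage sketch (not part of this file; new_addr is a placeholder
 * buffer of dev->addr_len bytes): the MTU and MAC address setters above
 * are meant to run with RTNL held, as the rtnetlink and ioctl paths do:
 *
 *	struct sockaddr sa = { .sa_family = dev->type };
 *	int err;
 *
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	if (!err)
 *		err = dev_set_mac_address_user(dev, &sa, NULL);
 *	rtnl_unlock();
 *
 * dev_get_mac_address(), by contrast, takes dev_addr_sem and the RCU
 * read lock internally, so it only needs the target namespace and the
 * device name rather than RTNL.
 */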
/** * dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * dev_get_port_parent_id - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier */ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(dev_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (dev_get_port_parent_id(a, &a_id, true) || dev_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); /** * dev_change_proto_down - set carrier according to proto_down. 
* * @dev: device * @proto_down: new value */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** * dev_change_proto_down_reason - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { u32 proto_down_reason; int b; if (!mask) { proto_down_reason = value; } else { proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) proto_down_reason |= BIT(b); else proto_down_reason &= ~BIT(b); } } WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u8 dev_xdp_sb_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog && !dev->xdp_state[i].prog->aux->xdp_has_frags) count++; return count; } int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; } return dev->netdev_ops->ndo_bpf(dev, bpf); } EXPORT_SYMBOL_GPL(dev_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? 
prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; } memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. */ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper 
device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct 
bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog 
*new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) bpf_prog_put(old_prog); return err; } u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; ASSERT_RTNL(); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1; return 0; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS return current == cleanup_net_task; #else return false; #endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) {
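		/*
		 * The upper device has switched this NETIF_F_UPPER_DISABLES
		 * feature off while the lower device still advertises it, so
		 * the feature needs to be turned off on the lower device as
		 * well to keep the stacked pair consistent.
		 */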